diff options
Diffstat (limited to 'fs')
167 files changed, 4559 insertions, 3222 deletions
diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 43dea3b00c29..8a2562e3a316 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -1075,8 +1075,6 @@ static void afs_vnode_new_inode(struct afs_fs_cursor *fc, if (fc->ac.error < 0) return; - d_drop(new_dentry); - inode = afs_iget(fc->vnode->vfs_inode.i_sb, fc->key, newfid, newstatus, newcb, fc->cbi); if (IS_ERR(inode)) { @@ -1090,7 +1088,7 @@ static void afs_vnode_new_inode(struct afs_fs_cursor *fc, vnode = AFS_FS_I(inode); set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); afs_vnode_commit_status(fc, vnode, 0); - d_add(new_dentry, inode); + d_instantiate(new_dentry, inode); } /* diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c index d049cb459742..fde6b4d4121e 100644 --- a/fs/afs/fs_probe.c +++ b/fs/afs/fs_probe.c @@ -61,8 +61,11 @@ void afs_fileserver_probe_result(struct afs_call *call) afs_io_error(call, afs_io_error_fs_probe_fail); goto out; case -ECONNRESET: /* Responded, but call expired. */ + case -ERFKILL: + case -EADDRNOTAVAIL: case -ENETUNREACH: case -EHOSTUNREACH: + case -EHOSTDOWN: case -ECONNREFUSED: case -ETIMEDOUT: case -ETIME: @@ -132,12 +135,14 @@ out: static int afs_do_probe_fileserver(struct afs_net *net, struct afs_server *server, struct key *key, - unsigned int server_index) + unsigned int server_index, + struct afs_error *_e) { struct afs_addr_cursor ac = { .index = 0, }; - int ret; + bool in_progress = false; + int err; _enter("%pU", &server->uuid); @@ -151,15 +156,17 @@ static int afs_do_probe_fileserver(struct afs_net *net, server->probe.rtt = UINT_MAX; for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++) { - ret = afs_fs_get_capabilities(net, server, &ac, key, server_index, + err = afs_fs_get_capabilities(net, server, &ac, key, server_index, true); - if (ret != -EINPROGRESS) { - afs_fs_probe_done(server); - return ret; - } + if (err == -EINPROGRESS) + in_progress = true; + else + afs_prioritise_error(_e, err, ac.abort_code); } - return 0; + if (!in_progress) + afs_fs_probe_done(server); + return in_progress; } /* @@ -169,21 +176,23 @@ int afs_probe_fileservers(struct afs_net *net, struct key *key, struct afs_server_list *list) { struct afs_server *server; - int i, ret; + struct afs_error e; + bool in_progress = false; + int i; + e.error = 0; + e.responded = false; for (i = 0; i < list->nr_servers; i++) { server = list->servers[i].server; if (test_bit(AFS_SERVER_FL_PROBED, &server->flags)) continue; - if (!test_and_set_bit_lock(AFS_SERVER_FL_PROBING, &server->flags)) { - ret = afs_do_probe_fileserver(net, server, key, i); - if (ret) - return ret; - } + if (!test_and_set_bit_lock(AFS_SERVER_FL_PROBING, &server->flags) && + afs_do_probe_fileserver(net, server, key, i, &e)) + in_progress = true; } - return 0; + return in_progress ? 0 : e.error; } /* diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 4c6d8e1112c2..6b17d3620414 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -382,7 +382,7 @@ void afs_zap_data(struct afs_vnode *vnode) int afs_validate(struct afs_vnode *vnode, struct key *key) { time64_t now = ktime_get_real_seconds(); - bool valid = false; + bool valid; int ret; _enter("{v={%llx:%llu} fl=%lx},%x", @@ -402,15 +402,21 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) vnode->cb_v_break = vnode->volume->cb_v_break; valid = false; } else if (vnode->status.type == AFS_FTYPE_DIR && - test_bit(AFS_VNODE_DIR_VALID, &vnode->flags) && - vnode->cb_expires_at - 10 > now) { - valid = true; - } else if (!test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) && - vnode->cb_expires_at - 10 > now) { + (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags) || + vnode->cb_expires_at - 10 <= now)) { + valid = false; + } else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) || + vnode->cb_expires_at - 10 <= now) { + valid = false; + } else { valid = true; } } else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { valid = true; + } else { + vnode->cb_s_break = vnode->cb_interest->server->cb_s_break; + vnode->cb_v_break = vnode->volume->cb_v_break; + valid = false; } read_sequnlock_excl(&vnode->cb_lock); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 5da3b09b7518..8871b9e8645f 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -696,6 +696,14 @@ struct afs_interface { }; /* + * Error prioritisation and accumulation. + */ +struct afs_error { + short error; /* Accumulated error */ + bool responded; /* T if server responded */ +}; + +/* * Cursor for iterating over a server's address list. */ struct afs_addr_cursor { @@ -1015,6 +1023,7 @@ static inline void __afs_stat(atomic_t *s) * misc.c */ extern int afs_abort_to_error(u32); +extern void afs_prioritise_error(struct afs_error *, int, u32); /* * mntpt.c diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 700a5fa7f4ec..bbb1fd51b019 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -118,3 +118,55 @@ int afs_abort_to_error(u32 abort_code) default: return -EREMOTEIO; } } + +/* + * Select the error to report from a set of errors. + */ +void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code) +{ + switch (error) { + case 0: + return; + default: + if (e->error == -ETIMEDOUT || + e->error == -ETIME) + return; + case -ETIMEDOUT: + case -ETIME: + if (e->error == -ENOMEM || + e->error == -ENONET) + return; + case -ENOMEM: + case -ENONET: + if (e->error == -ERFKILL) + return; + case -ERFKILL: + if (e->error == -EADDRNOTAVAIL) + return; + case -EADDRNOTAVAIL: + if (e->error == -ENETUNREACH) + return; + case -ENETUNREACH: + if (e->error == -EHOSTUNREACH) + return; + case -EHOSTUNREACH: + if (e->error == -EHOSTDOWN) + return; + case -EHOSTDOWN: + if (e->error == -ECONNREFUSED) + return; + case -ECONNREFUSED: + if (e->error == -ECONNRESET) + return; + case -ECONNRESET: /* Responded, but call expired. */ + if (e->responded) + return; + e->error = error; + return; + + case -ECONNABORTED: + e->responded = true; + e->error = afs_abort_to_error(abort_code); + return; + } +} diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index 00504254c1c2..c3ae324781f8 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -136,7 +136,8 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) struct afs_addr_list *alist; struct afs_server *server; struct afs_vnode *vnode = fc->vnode; - u32 rtt, abort_code; + struct afs_error e; + u32 rtt; int error = fc->ac.error, i; _enter("%lx[%d],%lx[%d],%d,%d", @@ -306,8 +307,11 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) if (fc->error != -EDESTADDRREQ) goto iterate_address; /* Fall through */ + case -ERFKILL: + case -EADDRNOTAVAIL: case -ENETUNREACH: case -EHOSTUNREACH: + case -EHOSTDOWN: case -ECONNREFUSED: _debug("no conn"); fc->error = error; @@ -446,50 +450,15 @@ no_more_servers: if (fc->flags & AFS_FS_CURSOR_VBUSY) goto restart_from_beginning; - abort_code = 0; - error = -EDESTADDRREQ; + e.error = -EDESTADDRREQ; + e.responded = false; for (i = 0; i < fc->server_list->nr_servers; i++) { struct afs_server *s = fc->server_list->servers[i].server; - int probe_error = READ_ONCE(s->probe.error); - switch (probe_error) { - case 0: - continue; - default: - if (error == -ETIMEDOUT || - error == -ETIME) - continue; - case -ETIMEDOUT: - case -ETIME: - if (error == -ENOMEM || - error == -ENONET) - continue; - case -ENOMEM: - case -ENONET: - if (error == -ENETUNREACH) - continue; - case -ENETUNREACH: - if (error == -EHOSTUNREACH) - continue; - case -EHOSTUNREACH: - if (error == -ECONNREFUSED) - continue; - case -ECONNREFUSED: - if (error == -ECONNRESET) - continue; - case -ECONNRESET: /* Responded, but call expired. */ - if (error == -ECONNABORTED) - continue; - case -ECONNABORTED: - abort_code = s->probe.abort_code; - error = probe_error; - continue; - } + afs_prioritise_error(&e, READ_ONCE(s->probe.error), + s->probe.abort_code); } - if (error == -ECONNABORTED) - error = afs_abort_to_error(abort_code); - failed_set_error: fc->error = error; failed: @@ -553,8 +522,11 @@ bool afs_select_current_fileserver(struct afs_fs_cursor *fc) _leave(" = f [abort]"); return false; + case -ERFKILL: + case -EADDRNOTAVAIL: case -ENETUNREACH: case -EHOSTUNREACH: + case -EHOSTDOWN: case -ECONNREFUSED: case -ETIMEDOUT: case -ETIME: @@ -633,6 +605,7 @@ int afs_end_vnode_operation(struct afs_fs_cursor *fc) struct afs_net *net = afs_v2net(fc->vnode); if (fc->error == -EDESTADDRREQ || + fc->error == -EADDRNOTAVAIL || fc->error == -ENETUNREACH || fc->error == -EHOSTUNREACH) afs_dump_edestaddrreq(fc); diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 59970886690f..a7b44863d502 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -576,6 +576,7 @@ static long afs_wait_for_call_to_complete(struct afs_call *call, { signed long rtt2, timeout; long ret; + bool stalled = false; u64 rtt; u32 life, last_life; @@ -609,12 +610,20 @@ static long afs_wait_for_call_to_complete(struct afs_call *call, life = rxrpc_kernel_check_life(call->net->socket, call->rxcall); if (timeout == 0 && - life == last_life && signal_pending(current)) + life == last_life && signal_pending(current)) { + if (stalled) break; + __set_current_state(TASK_RUNNING); + rxrpc_kernel_probe_life(call->net->socket, call->rxcall); + timeout = rtt2; + stalled = true; + continue; + } if (life != last_life) { timeout = rtt2; last_life = life; + stalled = false; } timeout = schedule_timeout(timeout); diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c index c0f616bd70cb..f0b032976487 100644 --- a/fs/afs/vl_probe.c +++ b/fs/afs/vl_probe.c @@ -61,8 +61,11 @@ void afs_vlserver_probe_result(struct afs_call *call) afs_io_error(call, afs_io_error_vl_probe_fail); goto out; case -ECONNRESET: /* Responded, but call expired. */ + case -ERFKILL: + case -EADDRNOTAVAIL: case -ENETUNREACH: case -EHOSTUNREACH: + case -EHOSTDOWN: case -ECONNREFUSED: case -ETIMEDOUT: case -ETIME: @@ -129,15 +132,17 @@ out: * Probe all of a vlserver's addresses to find out the best route and to * query its capabilities. */ -static int afs_do_probe_vlserver(struct afs_net *net, - struct afs_vlserver *server, - struct key *key, - unsigned int server_index) +static bool afs_do_probe_vlserver(struct afs_net *net, + struct afs_vlserver *server, + struct key *key, + unsigned int server_index, + struct afs_error *_e) { struct afs_addr_cursor ac = { .index = 0, }; - int ret; + bool in_progress = false; + int err; _enter("%s", server->name); @@ -151,15 +156,17 @@ static int afs_do_probe_vlserver(struct afs_net *net, server->probe.rtt = UINT_MAX; for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++) { - ret = afs_vl_get_capabilities(net, &ac, key, server, + err = afs_vl_get_capabilities(net, &ac, key, server, server_index, true); - if (ret != -EINPROGRESS) { - afs_vl_probe_done(server); - return ret; - } + if (err == -EINPROGRESS) + in_progress = true; + else + afs_prioritise_error(_e, err, ac.abort_code); } - return 0; + if (!in_progress) + afs_vl_probe_done(server); + return in_progress; } /* @@ -169,21 +176,23 @@ int afs_send_vl_probes(struct afs_net *net, struct key *key, struct afs_vlserver_list *vllist) { struct afs_vlserver *server; - int i, ret; + struct afs_error e; + bool in_progress = false; + int i; + e.error = 0; + e.responded = false; for (i = 0; i < vllist->nr_servers; i++) { server = vllist->servers[i].server; if (test_bit(AFS_VLSERVER_FL_PROBED, &server->flags)) continue; - if (!test_and_set_bit_lock(AFS_VLSERVER_FL_PROBING, &server->flags)) { - ret = afs_do_probe_vlserver(net, server, key, i); - if (ret) - return ret; - } + if (!test_and_set_bit_lock(AFS_VLSERVER_FL_PROBING, &server->flags) && + afs_do_probe_vlserver(net, server, key, i, &e)) + in_progress = true; } - return 0; + return in_progress ? 0 : e.error; } /* diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c index b64a284b99d2..7adde83a0648 100644 --- a/fs/afs/vl_rotate.c +++ b/fs/afs/vl_rotate.c @@ -71,8 +71,9 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc) { struct afs_addr_list *alist; struct afs_vlserver *vlserver; + struct afs_error e; u32 rtt; - int error = vc->ac.error, abort_code, i; + int error = vc->ac.error, i; _enter("%lx[%d],%lx[%d],%d,%d", vc->untried, vc->index, @@ -119,8 +120,11 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc) goto failed; } + case -ERFKILL: + case -EADDRNOTAVAIL: case -ENETUNREACH: case -EHOSTUNREACH: + case -EHOSTDOWN: case -ECONNREFUSED: case -ETIMEDOUT: case -ETIME: @@ -235,50 +239,15 @@ no_more_servers: if (vc->flags & AFS_VL_CURSOR_RETRY) goto restart_from_beginning; - abort_code = 0; - error = -EDESTADDRREQ; + e.error = -EDESTADDRREQ; + e.responded = false; for (i = 0; i < vc->server_list->nr_servers; i++) { struct afs_vlserver *s = vc->server_list->servers[i].server; - int probe_error = READ_ONCE(s->probe.error); - switch (probe_error) { - case 0: - continue; - default: - if (error == -ETIMEDOUT || - error == -ETIME) - continue; - case -ETIMEDOUT: - case -ETIME: - if (error == -ENOMEM || - error == -ENONET) - continue; - case -ENOMEM: - case -ENONET: - if (error == -ENETUNREACH) - continue; - case -ENETUNREACH: - if (error == -EHOSTUNREACH) - continue; - case -EHOSTUNREACH: - if (error == -ECONNREFUSED) - continue; - case -ECONNREFUSED: - if (error == -ECONNRESET) - continue; - case -ECONNRESET: /* Responded, but call expired. */ - if (error == -ECONNABORTED) - continue; - case -ECONNABORTED: - abort_code = s->probe.abort_code; - error = probe_error; - continue; - } + afs_prioritise_error(&e, READ_ONCE(s->probe.error), + s->probe.abort_code); } - if (error == -ECONNABORTED) - error = afs_abort_to_error(abort_code); - failed_set_error: vc->error = error; failed: @@ -341,6 +310,7 @@ int afs_end_vlserver_operation(struct afs_vl_cursor *vc) struct afs_net *net = vc->cell->net; if (vc->error == -EDESTADDRREQ || + vc->error == -EADDRNOTAVAIL || vc->error == -ENETUNREACH || vc->error == -EHOSTUNREACH) afs_vl_dump_edestaddrreq(vc); @@ -45,6 +45,7 @@ #include <asm/kmap_types.h> #include <linux/uaccess.h> +#include <linux/nospec.h> #include "internal.h" @@ -1038,6 +1039,7 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) if (!table || id >= table->nr) goto out; + id = array_index_nospec(id, table->nr); ctx = rcu_dereference(table->table[id]); if (ctx && ctx->user_id == ctx_id) { if (percpu_ref_tryget_live(&ctx->users)) @@ -1436,6 +1438,7 @@ static int aio_prep_rw(struct kiocb *req, struct iocb *iocb) ret = ioprio_check_cap(iocb->aio_reqprio); if (ret) { pr_debug("aio ioprio check cap error: %d\n", ret); + fput(req->ki_filp); return ret; } diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 68ebe188446a..78556447e1d5 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -591,7 +591,7 @@ unode_aux_to_inode_list(struct ulist_node *node) } /* - * We maintain three seperate rbtrees: one for direct refs, one for + * We maintain three separate rbtrees: one for direct refs, one for * indirect refs which have a key, and one for indirect refs which do not * have a key. Each tree does merge on insertion. * @@ -695,7 +695,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, } /* - * Now it's a direct ref, put it in the the direct tree. We must + * Now it's a direct ref, put it in the direct tree. We must * do this last because the ref could be merged/freed here. */ prelim_ref_insert(fs_info, &preftrees->direct, ref, NULL); @@ -2020,9 +2020,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, ret = -ENOMEM; break; } - extent_buffer_get(eb); - btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); btrfs_release_path(path); item = btrfs_item_nr(slot); @@ -2042,7 +2039,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, len = sizeof(*iref) + name_len; iref = (struct btrfs_inode_ref *)((char *)iref + len); } - btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); } @@ -2083,10 +2079,6 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, ret = -ENOMEM; break; } - extent_buffer_get(eb); - - btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); btrfs_release_path(path); item_size = btrfs_item_size_nr(eb, slot); @@ -2107,7 +2099,6 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, cur_offset += btrfs_inode_extref_name_len(eb, extref); cur_offset += sizeof(*extref); } - btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); offset++; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 97d91e55b70a..6f5d07415dab 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -20,7 +20,7 @@ * new data the application may have written before commit. */ enum { - BTRFS_INODE_ORDERED_DATA_CLOSE = 0, + BTRFS_INODE_ORDERED_DATA_CLOSE, BTRFS_INODE_DUMMY, BTRFS_INODE_IN_DEFRAG, BTRFS_INODE_HAS_ASYNC_EXTENT, @@ -29,6 +29,7 @@ enum { BTRFS_INODE_IN_DELALLOC_LIST, BTRFS_INODE_READDIO_NEED_LOCK, BTRFS_INODE_HAS_PROPS, + BTRFS_INODE_SNAPSHOT_FLUSH, }; /* in memory btrfs inode */ @@ -147,6 +148,12 @@ struct btrfs_inode { u64 last_unlink_trans; /* + * Track the transaction id of the last transaction used to create a + * hard link for the inode. This is used by the log tree (fsync). + */ + u64 last_link_trans; + + /* * Number of bytes outstanding that are going to need csums. This is * used in ENOSPC accounting. */ @@ -253,6 +260,11 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode) return false; } +static inline bool is_data_inode(struct inode *inode) +{ + return btrfs_ino(BTRFS_I(inode)) != BTRFS_BTREE_INODE_OBJECTID; +} + static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode, int mod) { diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 2e43fba44035..b0c8094528d1 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1202,24 +1202,24 @@ static void btrfsic_read_from_block_data( void *dstv, u32 offset, size_t len) { size_t cur; - size_t offset_in_page; + size_t pgoff; char *kaddr; char *dst = (char *)dstv; - size_t start_offset = block_ctx->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(block_ctx->start); unsigned long i = (start_offset + offset) >> PAGE_SHIFT; WARN_ON(offset + len > block_ctx->len); - offset_in_page = (start_offset + offset) & (PAGE_SIZE - 1); + pgoff = offset_in_page(start_offset + offset); while (len > 0) { - cur = min(len, ((size_t)PAGE_SIZE - offset_in_page)); + cur = min(len, ((size_t)PAGE_SIZE - pgoff)); BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_SIZE)); kaddr = block_ctx->datav[i]; - memcpy(dst, kaddr + offset_in_page, cur); + memcpy(dst, kaddr + pgoff, cur); dst += cur; len -= cur; - offset_in_page = 0; + pgoff = 0; i++; } } @@ -1601,7 +1601,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, BUG_ON(block_ctx->datav); BUG_ON(block_ctx->pagev); BUG_ON(block_ctx->mem_to_free); - if (block_ctx->dev_bytenr & ((u64)PAGE_SIZE - 1)) { + if (!PAGE_ALIGNED(block_ctx->dev_bytenr)) { pr_info("btrfsic: read_block() with unaligned bytenr %llu\n", block_ctx->dev_bytenr); return -1; @@ -1720,7 +1720,7 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state, num_pages = state->metablock_size >> PAGE_SHIFT; h = (struct btrfs_header *)datav[0]; - if (memcmp(h->fsid, fs_info->fsid, BTRFS_FSID_SIZE)) + if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE)) return 1; for (i = 0; i < num_pages; i++) { @@ -1778,7 +1778,7 @@ again: return; } is_metadata = 1; - BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_SIZE - 1)); + BUG_ON(!PAGE_ALIGNED(BTRFS_SUPER_INFO_SIZE)); processed_len = BTRFS_SUPER_INFO_SIZE; if (state->print_mask & BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { @@ -2327,7 +2327,7 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, * write operations. Therefore it keeps the linkage * information for a block until a block is * rewritten. This can temporarily cause incorrect - * and even circular linkage informations. This + * and even circular linkage information. This * causes no harm unless such blocks are referenced * by the most recent super block. */ @@ -2892,12 +2892,12 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info, struct list_head *dev_head = &fs_devices->devices; struct btrfs_device *device; - if (fs_info->nodesize & ((u64)PAGE_SIZE - 1)) { + if (!PAGE_ALIGNED(fs_info->nodesize)) { pr_info("btrfsic: cannot handle nodesize %d not being a multiple of PAGE_SIZE %ld!\n", fs_info->nodesize, PAGE_SIZE); return -1; } - if (fs_info->sectorsize & ((u64)PAGE_SIZE - 1)) { + if (!PAGE_ALIGNED(fs_info->sectorsize)) { pr_info("btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_SIZE %ld!\n", fs_info->sectorsize, PAGE_SIZE); return -1; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 2955a4ea2fa8..548057630b69 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -229,7 +229,6 @@ static noinline void end_compressed_writeback(struct inode *inode, */ static void end_compressed_bio_write(struct bio *bio) { - struct extent_io_tree *tree; struct compressed_bio *cb = bio->bi_private; struct inode *inode; struct page *page; @@ -248,14 +247,10 @@ static void end_compressed_bio_write(struct bio *bio) * call back into the FS and do all the end_io operations */ inode = cb->inode; - tree = &BTRFS_I(inode)->io_tree; cb->compressed_pages[0]->mapping = cb->inode->i_mapping; - tree->ops->writepage_end_io_hook(cb->compressed_pages[0], - cb->start, - cb->start + cb->len - 1, - NULL, - bio->bi_status ? - BLK_STS_OK : BLK_STS_NOTSUPP); + btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0], + cb->start, cb->start + cb->len - 1, + bio->bi_status ? BLK_STS_OK : BLK_STS_NOTSUPP); cb->compressed_pages[0]->mapping = NULL; end_compressed_writeback(inode, cb); @@ -306,7 +301,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, blk_status_t ret; int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - WARN_ON(start & ((u64)PAGE_SIZE - 1)); + WARN_ON(!PAGE_ALIGNED(start)); cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); if (!cb) return BLK_STS_RESOURCE; @@ -337,7 +332,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, page = compressed_pages[pg_index]; page->mapping = inode->i_mapping; if (bio->bi_iter.bi_size) - submit = btrfs_merge_bio_hook(page, 0, PAGE_SIZE, bio, 0); + submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio, + 0); page->mapping = NULL; if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) < @@ -481,7 +477,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (page->index == end_index) { char *userpage; - size_t zero_offset = isize & (PAGE_SIZE - 1); + size_t zero_offset = offset_in_page(isize); if (zero_offset) { int zeros; @@ -615,8 +611,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, page->index = em_start >> PAGE_SHIFT; if (comp_bio->bi_iter.bi_size) - submit = btrfs_merge_bio_hook(page, 0, PAGE_SIZE, - comp_bio, 0); + submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, + comp_bio, 0); page->mapping = NULL; if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < @@ -1207,7 +1203,7 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, /* * Shannon Entropy calculation * - * Pure byte distribution analysis fails to determine compressiability of data. + * Pure byte distribution analysis fails to determine compressibility of data. * Try calculating entropy to estimate the average minimum number of bits * needed to encode the sampled data. * @@ -1271,7 +1267,7 @@ static u8 get4bits(u64 num, int shift) { /* * Use 4 bits as radix base - * Use 16 u32 counters for calculating new possition in buf array + * Use 16 u32 counters for calculating new position in buf array * * @array - array that will be sorted * @array_buf - buffer array to store sorting results diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 539901fb5165..d92462fe66c8 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -12,6 +12,7 @@ #include "transaction.h" #include "print-tree.h" #include "locking.h" +#include "volumes.h" static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); @@ -224,7 +225,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, else btrfs_set_header_owner(cow, new_root_objectid); - write_extent_buffer_fsid(cow, fs_info->fsid); + write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); WARN_ON(btrfs_header_generation(buf) > trans->transid); if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) @@ -1050,7 +1051,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, else btrfs_set_header_owner(cow, root->root_key.objectid); - write_extent_buffer_fsid(cow, fs_info->fsid); + write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); if (ret) { @@ -1290,7 +1291,6 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); - extent_buffer_get(eb_rewin); btrfs_tree_read_lock(eb_rewin); __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm); WARN_ON(btrfs_header_nritems(eb_rewin) > @@ -1362,7 +1362,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq) if (!eb) return NULL; - extent_buffer_get(eb); btrfs_tree_read_lock(eb); if (old_root) { btrfs_set_header_bytenr(eb, eb->start); @@ -1415,7 +1414,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, * * What is forced COW: * when we create snapshot during committing the transaction, - * after we've finished coping src root, we must COW the shared + * after we've finished copying src root, we must COW the shared * block to ensure the metadata consistency. */ if (btrfs_header_generation(buf) == trans->transid && @@ -1441,6 +1440,10 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, u64 search_start; int ret; + if (test_bit(BTRFS_ROOT_DELETING, &root->state)) + btrfs_err(fs_info, + "COW'ing blocks on a fs root that's being dropped"); + if (trans->transaction != fs_info->running_transaction) WARN(1, KERN_CRIT "trans %llu running %llu\n", trans->transid, @@ -2584,14 +2587,27 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, root_lock = BTRFS_READ_LOCK; if (p->search_commit_root) { - /* The commit roots are read only so we always do read locks */ - if (p->need_commit_sem) + /* + * The commit roots are read only so we always do read locks, + * and we always must hold the commit_root_sem when doing + * searches on them, the only exception is send where we don't + * want to block transaction commits for a long time, so + * we need to clone the commit root in order to avoid races + * with transaction commits that create a snapshot of one of + * the roots used by a send operation. + */ + if (p->need_commit_sem) { down_read(&fs_info->commit_root_sem); - b = root->commit_root; - extent_buffer_get(b); - level = btrfs_header_level(b); - if (p->need_commit_sem) + b = btrfs_clone_extent_buffer(root->commit_root); up_read(&fs_info->commit_root_sem); + if (!b) + return ERR_PTR(-ENOMEM); + + } else { + b = root->commit_root; + extent_buffer_get(b); + } + level = btrfs_header_level(b); /* * Ensure that all callers have set skip_locking when * p->search_commit_root = 1. @@ -2717,6 +2733,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, again: prev_cmp = -1; b = btrfs_search_slot_get_root(root, p, write_lock_level); + if (IS_ERR(b)) { + ret = PTR_ERR(b); + goto done; + } while (b) { level = btrfs_header_level(b); @@ -3751,7 +3771,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root /* Key greater than all keys in the leaf, right neighbor has * enough room for it and we're not emptying our leaf to delete * it, therefore use right neighbor to insert the new item and - * no need to touch/dirty our left leaft. */ + * no need to touch/dirty our left leaf. */ btrfs_tree_unlock(left); free_extent_buffer(left); path->nodes[0] = right; @@ -5390,7 +5410,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root, ret = -ENOMEM; goto out; } - extent_buffer_get(left_path->nodes[left_level]); right_level = btrfs_header_level(right_root->commit_root); right_root_level = right_level; @@ -5401,7 +5420,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root, ret = -ENOMEM; goto out; } - extent_buffer_get(right_path->nodes[right_level]); up_read(&fs_info->commit_root_sem); if (left_level == 0) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 80953528572d..f031a447a047 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -109,13 +109,26 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) } /* - * File system states + * Runtime (in-memory) states of filesystem */ -#define BTRFS_FS_STATE_ERROR 0 -#define BTRFS_FS_STATE_REMOUNTING 1 -#define BTRFS_FS_STATE_TRANS_ABORTED 2 -#define BTRFS_FS_STATE_DEV_REPLACING 3 -#define BTRFS_FS_STATE_DUMMY_FS_INFO 4 +enum { + /* Global indicator of serious filesystem errors */ + BTRFS_FS_STATE_ERROR, + /* + * Filesystem is being remounted, allow to skip some operations, like + * defrag + */ + BTRFS_FS_STATE_REMOUNTING, + /* Track if a transaction abort has been reported on this filesystem */ + BTRFS_FS_STATE_TRANS_ABORTED, + /* + * Bio operations should be blocked on this filesystem because a source + * or target device is being destroyed as part of a device replace + */ + BTRFS_FS_STATE_DEV_REPLACING, + /* The btrfs_fs_info created for self-tests */ + BTRFS_FS_STATE_DUMMY_FS_INFO, +}; #define BTRFS_BACKREF_REV_MAX 256 #define BTRFS_BACKREF_REV_SHIFT 56 @@ -195,9 +208,10 @@ struct btrfs_root_backup { * it currently lacks any block count etc etc */ struct btrfs_super_block { - u8 csum[BTRFS_CSUM_SIZE]; /* the first 4 fields must match struct btrfs_header */ - u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + u8 csum[BTRFS_CSUM_SIZE]; + /* FS specific UUID, visible to user */ + u8 fsid[BTRFS_FSID_SIZE]; __le64 bytenr; /* this block number */ __le64 flags; @@ -234,8 +248,11 @@ struct btrfs_super_block { __le64 cache_generation; __le64 uuid_tree_generation; + /* the UUID written into btree blocks */ + u8 metadata_uuid[BTRFS_FSID_SIZE]; + /* future expansion */ - __le64 reserved[30]; + __le64 reserved[28]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; } __attribute__ ((__packed__)); @@ -265,7 +282,8 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_RAID56 | \ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ - BTRFS_FEATURE_INCOMPAT_NO_HOLES) + BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ + BTRFS_FEATURE_INCOMPAT_METADATA_UUID) #define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) @@ -316,7 +334,7 @@ struct btrfs_node { * The slots array records the index of the item or block pointer * used while walking the tree. */ -enum { READA_NONE = 0, READA_BACK, READA_FORWARD }; +enum { READA_NONE, READA_BACK, READA_FORWARD }; struct btrfs_path { struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; int slots[BTRFS_MAX_LEVEL]; @@ -360,9 +378,7 @@ struct btrfs_dev_replace { struct btrfs_device *tgtdev; struct mutex lock_finishing_cancel_unmount; - rwlock_t lock; - atomic_t blocking_readers; - wait_queue_head_t read_lock_wq; + struct rw_semaphore rwsem; struct btrfs_scrub_progress scrub_progress; @@ -443,13 +459,19 @@ struct btrfs_space_info { struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; }; -#define BTRFS_BLOCK_RSV_GLOBAL 1 -#define BTRFS_BLOCK_RSV_DELALLOC 2 -#define BTRFS_BLOCK_RSV_TRANS 3 -#define BTRFS_BLOCK_RSV_CHUNK 4 -#define BTRFS_BLOCK_RSV_DELOPS 5 -#define BTRFS_BLOCK_RSV_EMPTY 6 -#define BTRFS_BLOCK_RSV_TEMP 7 +/* + * Types of block reserves + */ +enum { + BTRFS_BLOCK_RSV_GLOBAL, + BTRFS_BLOCK_RSV_DELALLOC, + BTRFS_BLOCK_RSV_TRANS, + BTRFS_BLOCK_RSV_CHUNK, + BTRFS_BLOCK_RSV_DELOPS, + BTRFS_BLOCK_RSV_DELREFS, + BTRFS_BLOCK_RSV_EMPTY, + BTRFS_BLOCK_RSV_TEMP, +}; struct btrfs_block_rsv { u64 size; @@ -509,18 +531,18 @@ struct btrfs_free_cluster { }; enum btrfs_caching_type { - BTRFS_CACHE_NO = 0, - BTRFS_CACHE_STARTED = 1, - BTRFS_CACHE_FAST = 2, - BTRFS_CACHE_FINISHED = 3, - BTRFS_CACHE_ERROR = 4, + BTRFS_CACHE_NO, + BTRFS_CACHE_STARTED, + BTRFS_CACHE_FAST, + BTRFS_CACHE_FINISHED, + BTRFS_CACHE_ERROR, }; enum btrfs_disk_cache_state { - BTRFS_DC_WRITTEN = 0, - BTRFS_DC_ERROR = 1, - BTRFS_DC_CLEAR = 2, - BTRFS_DC_SETUP = 3, + BTRFS_DC_WRITTEN, + BTRFS_DC_ERROR, + BTRFS_DC_CLEAR, + BTRFS_DC_SETUP, }; struct btrfs_caching_control { @@ -712,41 +734,61 @@ struct btrfs_fs_devices; struct btrfs_balance_control; struct btrfs_delayed_root; -#define BTRFS_FS_BARRIER 1 -#define BTRFS_FS_CLOSING_START 2 -#define BTRFS_FS_CLOSING_DONE 3 -#define BTRFS_FS_LOG_RECOVERING 4 -#define BTRFS_FS_OPEN 5 -#define BTRFS_FS_QUOTA_ENABLED 6 -#define BTRFS_FS_UPDATE_UUID_TREE_GEN 9 -#define BTRFS_FS_CREATING_FREE_SPACE_TREE 10 -#define BTRFS_FS_BTREE_ERR 11 -#define BTRFS_FS_LOG1_ERR 12 -#define BTRFS_FS_LOG2_ERR 13 -#define BTRFS_FS_QUOTA_OVERRIDE 14 -/* Used to record internally whether fs has been frozen */ -#define BTRFS_FS_FROZEN 15 - -/* - * Indicate that a whole-filesystem exclusive operation is running - * (device replace, resize, device add/delete, balance) - */ -#define BTRFS_FS_EXCL_OP 16 - /* - * To info transaction_kthread we need an immediate commit so it doesn't - * need to wait for commit_interval + * Block group or device which contains an active swapfile. Used for preventing + * unsafe operations while a swapfile is active. + * + * These are sorted on (ptr, inode) (note that a block group or device can + * contain more than one swapfile). We compare the pointer values because we + * don't actually care what the object is, we just need a quick check whether + * the object exists in the rbtree. */ -#define BTRFS_FS_NEED_ASYNC_COMMIT 17 +struct btrfs_swapfile_pin { + struct rb_node node; + void *ptr; + struct inode *inode; + /* + * If true, ptr points to a struct btrfs_block_group_cache. Otherwise, + * ptr points to a struct btrfs_device. + */ + bool is_block_group; +}; -/* - * Indicate that balance has been set up from the ioctl and is in the main - * phase. The fs_info::balance_ctl is initialized. - */ -#define BTRFS_FS_BALANCE_RUNNING 18 +bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); + +enum { + BTRFS_FS_BARRIER, + BTRFS_FS_CLOSING_START, + BTRFS_FS_CLOSING_DONE, + BTRFS_FS_LOG_RECOVERING, + BTRFS_FS_OPEN, + BTRFS_FS_QUOTA_ENABLED, + BTRFS_FS_UPDATE_UUID_TREE_GEN, + BTRFS_FS_CREATING_FREE_SPACE_TREE, + BTRFS_FS_BTREE_ERR, + BTRFS_FS_LOG1_ERR, + BTRFS_FS_LOG2_ERR, + BTRFS_FS_QUOTA_OVERRIDE, + /* Used to record internally whether fs has been frozen */ + BTRFS_FS_FROZEN, + /* + * Indicate that a whole-filesystem exclusive operation is running + * (device replace, resize, device add/delete, balance) + */ + BTRFS_FS_EXCL_OP, + /* + * To info transaction_kthread we need an immediate commit so it + * doesn't need to wait for commit_interval + */ + BTRFS_FS_NEED_ASYNC_COMMIT, + /* + * Indicate that balance has been set up from the ioctl and is in the + * main phase. The fs_info::balance_ctl is initialized. + */ + BTRFS_FS_BALANCE_RUNNING, +}; struct btrfs_fs_info { - u8 fsid[BTRFS_FSID_SIZE]; u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; unsigned long flags; struct btrfs_root *extent_root; @@ -790,6 +832,8 @@ struct btrfs_fs_info { struct btrfs_block_rsv chunk_block_rsv; /* block reservation for delayed operations */ struct btrfs_block_rsv delayed_block_rsv; + /* block reservation for delayed refs */ + struct btrfs_block_rsv delayed_refs_rsv; struct btrfs_block_rsv empty_block_rsv; @@ -1114,6 +1158,10 @@ struct btrfs_fs_info { u32 sectorsize; u32 stripesize; + /* Block groups and devices containing active swapfiles. */ + spinlock_t swapfile_pins_lock; + struct rb_root swapfile_pins; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; @@ -1133,22 +1181,24 @@ struct btrfs_subvolume_writers { /* * The state of btrfs root */ -/* - * btrfs_record_root_in_trans is a multi-step process, - * and it can race with the balancing code. But the - * race is very small, and only the first time the root - * is added to each transaction. So IN_TRANS_SETUP - * is used to tell us when more checks are required - */ -#define BTRFS_ROOT_IN_TRANS_SETUP 0 -#define BTRFS_ROOT_REF_COWS 1 -#define BTRFS_ROOT_TRACK_DIRTY 2 -#define BTRFS_ROOT_IN_RADIX 3 -#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 4 -#define BTRFS_ROOT_DEFRAG_RUNNING 5 -#define BTRFS_ROOT_FORCE_COW 6 -#define BTRFS_ROOT_MULTI_LOG_TASKS 7 -#define BTRFS_ROOT_DIRTY 8 +enum { + /* + * btrfs_record_root_in_trans is a multi-step process, and it can race + * with the balancing code. But the race is very small, and only the + * first time the root is added to each transaction. So IN_TRANS_SETUP + * is used to tell us when more checks are required + */ + BTRFS_ROOT_IN_TRANS_SETUP, + BTRFS_ROOT_REF_COWS, + BTRFS_ROOT_TRACK_DIRTY, + BTRFS_ROOT_IN_RADIX, + BTRFS_ROOT_ORPHAN_ITEM_INSERTED, + BTRFS_ROOT_DEFRAG_RUNNING, + BTRFS_ROOT_FORCE_COW, + BTRFS_ROOT_MULTI_LOG_TASKS, + BTRFS_ROOT_DIRTY, + BTRFS_ROOT_DELETING, +}; /* * in ram representation of the tree. extent_root is used for all allocations @@ -1274,6 +1324,9 @@ struct btrfs_root { u64 qgroup_meta_rsv_pertrans; u64 qgroup_meta_rsv_prealloc; + /* Number of active swapfiles */ + atomic_t nr_swapfiles; + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS u64 alloc_bytenr; #endif @@ -2570,10 +2623,10 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) /* extent-tree.c */ enum btrfs_inline_ref_type { - BTRFS_REF_TYPE_INVALID = 0, - BTRFS_REF_TYPE_BLOCK = 1, - BTRFS_REF_TYPE_DATA = 2, - BTRFS_REF_TYPE_ANY = 3, + BTRFS_REF_TYPE_INVALID, + BTRFS_REF_TYPE_BLOCK, + BTRFS_REF_TYPE_DATA, + BTRFS_REF_TYPE_ANY, }; int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, @@ -2599,7 +2652,7 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info, } int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans); -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans); +bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start); void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); @@ -2713,10 +2766,12 @@ enum btrfs_reserve_flush_enum { enum btrfs_flush_state { FLUSH_DELAYED_ITEMS_NR = 1, FLUSH_DELAYED_ITEMS = 2, - FLUSH_DELALLOC = 3, - FLUSH_DELALLOC_WAIT = 4, - ALLOC_CHUNK = 5, - COMMIT_TRANS = 6, + FLUSH_DELAYED_REFS_NR = 3, + FLUSH_DELAYED_REFS = 4, + FLUSH_DELALLOC = 5, + FLUSH_DELALLOC_WAIT = 6, + ALLOC_CHUNK = 7, + COMMIT_TRANS = 8, }; int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); @@ -2767,6 +2822,13 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes); +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr); +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); +int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, + enum btrfs_reserve_flush_enum flush); +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *src, + u64 num_bytes); int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache); void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); @@ -3141,7 +3203,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct inode *inode, u64 new_size, u32 min_type); -int btrfs_start_delalloc_inodes(struct btrfs_root *root); +int btrfs_start_delalloc_snapshot(struct btrfs_root *root); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, unsigned int extra_bits, @@ -3150,9 +3212,16 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, struct btrfs_root *parent_root, u64 new_dirid); -int btrfs_merge_bio_hook(struct page *page, unsigned long offset, - size_t size, struct bio *bio, - unsigned long bio_flags); + void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, + unsigned *bits); +void btrfs_clear_delalloc_extent(struct inode *inode, + struct extent_state *state, unsigned *bits); +void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, + struct extent_state *other); +void btrfs_split_delalloc_extent(struct inode *inode, + struct extent_state *orig, u64 split); +int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, + unsigned long bio_flags); void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end); vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); @@ -3163,6 +3232,9 @@ void btrfs_destroy_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); int __init btrfs_init_cachep(void); void __cold btrfs_destroy_cachep(void); +struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location, + struct btrfs_root *root, int *new, + struct btrfs_path *path); struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, struct btrfs_root *root, int *was_new); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, @@ -3186,6 +3258,12 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, struct btrfs_trans_handle *trans, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); +int btrfs_run_delalloc_range(void *private_data, struct page *locked_page, + u64 start, u64 end, int *page_started, unsigned long *nr_written, + struct writeback_control *wbc); +int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end); +void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, + u64 end, int uptodate); extern const struct dentry_operations btrfs_dentry_operations; /* ioctl.c */ @@ -3425,6 +3503,16 @@ static inline void assfail(const char *expr, const char *file, int line) #define ASSERT(expr) ((void)0) #endif +/* + * Use that for functions that are conditionally exported for sanity tests but + * otherwise static + */ +#ifndef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +#define EXPORT_FOR_TESTS static +#else +#define EXPORT_FOR_TESTS +#endif + __cold static inline void btrfs_print_v0_err(struct btrfs_fs_info *fs_info) { diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 9301b3ad9217..cad36c99a483 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -251,8 +251,6 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, ref->in_tree = 0; btrfs_put_delayed_ref(ref); atomic_dec(&delayed_refs->num_entries); - if (trans->delayed_ref_updates) - trans->delayed_ref_updates--; } static bool merge_ref(struct btrfs_trans_handle *trans, @@ -400,6 +398,20 @@ again: return head; } +void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) +{ + lockdep_assert_held(&delayed_refs->lock); + lockdep_assert_held(&head->lock); + + rb_erase_cached(&head->href_node, &delayed_refs->href_root); + RB_CLEAR_NODE(&head->href_node); + atomic_dec(&delayed_refs->num_entries); + delayed_refs->num_heads--; + if (head->processing == 0) + delayed_refs->num_heads_ready--; +} + /* * Helper to insert the ref_node to the tail or merge with tail. * @@ -453,7 +465,6 @@ inserted: if (ref->action == BTRFS_ADD_DELAYED_REF) list_add_tail(&ref->add_list, &href->ref_add_list); atomic_inc(&root->num_entries); - trans->delayed_ref_updates++; spin_unlock(&href->lock); return ret; } @@ -462,12 +473,14 @@ inserted: * helper function to update the accounting in the head ref * existing and update must have the same bytenr */ -static noinline void -update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, +static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *existing, struct btrfs_delayed_ref_head *update, int *old_ref_mod_ret) { + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + struct btrfs_fs_info *fs_info = trans->fs_info; int old_ref_mod; BUG_ON(existing->is_data != update->is_data); @@ -525,10 +538,18 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, * versa we need to make sure to adjust pending_csums accordingly. */ if (existing->is_data) { - if (existing->total_ref_mod >= 0 && old_ref_mod < 0) + u64 csum_leaves = + btrfs_csum_bytes_to_leaves(fs_info, + existing->num_bytes); + + if (existing->total_ref_mod >= 0 && old_ref_mod < 0) { delayed_refs->pending_csums -= existing->num_bytes; - if (existing->total_ref_mod < 0 && old_ref_mod >= 0) + btrfs_delayed_refs_rsv_release(fs_info, csum_leaves); + } + if (existing->total_ref_mod < 0 && old_ref_mod >= 0) { delayed_refs->pending_csums += existing->num_bytes; + trans->delayed_ref_updates += csum_leaves; + } } spin_unlock(&existing->lock); } @@ -634,7 +655,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, && head_ref->qgroup_reserved && existing->qgroup_ref_root && existing->qgroup_reserved); - update_existing_head_ref(delayed_refs, existing, head_ref, + update_existing_head_ref(trans, existing, head_ref, old_ref_mod); /* * we've updated the existing ref, free the newly @@ -645,8 +666,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, } else { if (old_ref_mod) *old_ref_mod = 0; - if (head_ref->is_data && head_ref->ref_mod < 0) + if (head_ref->is_data && head_ref->ref_mod < 0) { delayed_refs->pending_csums += head_ref->num_bytes; + trans->delayed_ref_updates += + btrfs_csum_bytes_to_leaves(trans->fs_info, + head_ref->num_bytes); + } delayed_refs->num_heads++; delayed_refs->num_heads_ready++; atomic_inc(&delayed_refs->num_entries); @@ -782,6 +807,12 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); + /* + * Need to update the delayed_refs_rsv with any changes we may have + * made. + */ + btrfs_update_delayed_refs_rsv(trans); + trace_add_delayed_tree_ref(fs_info, &ref->node, ref, action == BTRFS_ADD_DELAYED_EXTENT ? BTRFS_ADD_DELAYED_REF : action); @@ -863,6 +894,12 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); + /* + * Need to update the delayed_refs_rsv with any changes we may have + * made. + */ + btrfs_update_delayed_refs_rsv(trans); + trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref, action == BTRFS_ADD_DELAYED_EXTENT ? BTRFS_ADD_DELAYED_REF : action); @@ -899,6 +936,12 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, NULL, NULL, NULL); spin_unlock(&delayed_refs->lock); + + /* + * Need to update the delayed_refs_rsv with any changes we may have + * made. + */ + btrfs_update_delayed_refs_rsv(trans); return 0; } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 8e20c5cb5404..d2af974f68a1 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -261,7 +261,8 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) { mutex_unlock(&head->mutex); } - +void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); struct btrfs_delayed_ref_head *btrfs_select_ref_head( struct btrfs_delayed_ref_root *delayed_refs); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 2aa48aecc52b..8750c835f535 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -59,7 +59,6 @@ no_valid_dev_replace_entry_found: BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED; dev_replace->cont_reading_from_srcdev_mode = BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS; - dev_replace->replace_state = 0; dev_replace->time_started = 0; dev_replace->time_stopped = 0; atomic64_set(&dev_replace->num_write_errors, 0); @@ -285,13 +284,13 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, struct btrfs_dev_replace_item *ptr; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - btrfs_dev_replace_read_lock(dev_replace); + down_read(&dev_replace->rwsem); if (!dev_replace->is_valid || !dev_replace->item_needs_writeback) { - btrfs_dev_replace_read_unlock(dev_replace); + up_read(&dev_replace->rwsem); return 0; } - btrfs_dev_replace_read_unlock(dev_replace); + up_read(&dev_replace->rwsem); key.objectid = 0; key.type = BTRFS_DEV_REPLACE_KEY; @@ -349,7 +348,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_replace_item); - btrfs_dev_replace_write_lock(dev_replace); + down_write(&dev_replace->rwsem); if (dev_replace->srcdev) btrfs_set_dev_replace_src_devid(eb, ptr, dev_replace->srcdev->devid); @@ -372,7 +371,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, btrfs_set_dev_replace_cursor_right(eb, ptr, dev_replace->cursor_right); dev_replace->item_needs_writeback = 0; - btrfs_dev_replace_write_unlock(dev_replace); + up_write(&dev_replace->rwsem); btrfs_mark_buffer_dirty(eb); @@ -390,7 +389,7 @@ static char* btrfs_dev_name(struct btrfs_device *device) return rcu_str_deref(device->name); } -int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, +static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, const char *tgtdev_name, u64 srcdevid, const char *srcdev_name, int read_src) { @@ -407,6 +406,13 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, if (IS_ERR(src_device)) return PTR_ERR(src_device); + if (btrfs_pinned_by_swapfile(fs_info, src_device)) { + btrfs_warn_in_rcu(fs_info, + "cannot replace device %s (devid %llu) due to active swapfile", + btrfs_dev_name(src_device), src_device->devid); + return -ETXTBSY; + } + ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name, src_device, &tgt_device); if (ret) @@ -426,7 +432,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, } need_unlock = true; - btrfs_dev_replace_write_lock(dev_replace); + down_write(&dev_replace->rwsem); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: @@ -464,7 +470,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, dev_replace->item_needs_writeback = 1; atomic64_set(&dev_replace->num_write_errors, 0); atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); - btrfs_dev_replace_write_unlock(dev_replace); + up_write(&dev_replace->rwsem); need_unlock = false; ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); @@ -478,7 +484,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, if (IS_ERR(trans)) { ret = PTR_ERR(trans); need_unlock = true; - btrfs_dev_replace_write_lock(dev_replace); + down_write(&dev_replace->rwsem); dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; dev_replace->srcdev = NULL; @@ -497,7 +503,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, ret = btrfs_dev_replace_finishing(fs_info, ret); if (ret == -EINPROGRESS) { ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; - } else { + } else if (ret != -ECANCELED) { WARN_ON(ret); } @@ -505,7 +511,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, leave: if (need_unlock) - btrfs_dev_replace_write_unlock(dev_replace); + up_write(&dev_replace->rwsem); btrfs_destroy_dev_replace_tgtdev(tgt_device); return ret; } @@ -533,8 +539,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, args->start.cont_reading_from_srcdev_mode); args->result = ret; /* don't warn if EINPROGRESS, someone else might be running scrub */ - if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS) - ret = 0; + if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS || + ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR) + return 0; return ret; } @@ -572,18 +579,18 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, /* don't allow cancel or unmount to disturb the finishing procedure */ mutex_lock(&dev_replace->lock_finishing_cancel_unmount); - btrfs_dev_replace_read_lock(dev_replace); + down_read(&dev_replace->rwsem); /* was the operation canceled, or is it finished? */ if (dev_replace->replace_state != BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { - btrfs_dev_replace_read_unlock(dev_replace); + up_read(&dev_replace->rwsem); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return 0; } tgt_device = dev_replace->tgtdev; src_device = dev_replace->srcdev; - btrfs_dev_replace_read_unlock(dev_replace); + up_read(&dev_replace->rwsem); /* * flush all outstanding I/O and inode extent mappings before the @@ -607,7 +614,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, /* keep away write_all_supers() during the finishing procedure */ mutex_lock(&fs_info->fs_devices->device_list_mutex); mutex_lock(&fs_info->chunk_mutex); - btrfs_dev_replace_write_lock(dev_replace); + down_write(&dev_replace->rwsem); dev_replace->replace_state = scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; @@ -622,12 +629,13 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, src_device, tgt_device); } else { - btrfs_err_in_rcu(fs_info, + if (scrub_ret != -ECANCELED) + btrfs_err_in_rcu(fs_info, "btrfs_scrub_dev(%s, %llu, %s) failed %d", btrfs_dev_name(src_device), src_device->devid, rcu_str_deref(tgt_device->name), scrub_ret); - btrfs_dev_replace_write_unlock(dev_replace); + up_write(&dev_replace->rwsem); mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); btrfs_rm_dev_replace_blocked(fs_info); @@ -663,8 +671,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); fs_info->fs_devices->rw_devices++; - btrfs_dev_replace_write_unlock(dev_replace); - + up_write(&dev_replace->rwsem); btrfs_rm_dev_replace_blocked(fs_info); btrfs_rm_dev_replace_remove_srcdev(src_device); @@ -761,7 +768,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - btrfs_dev_replace_read_lock(dev_replace); + down_read(&dev_replace->rwsem); /* even if !dev_replace_is_valid, the values are good enough for * the replace_status ioctl */ args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; @@ -773,7 +780,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, args->status.num_uncorrectable_read_errors = atomic64_read(&dev_replace->num_uncorrectable_read_errors); args->status.progress_1000 = btrfs_dev_replace_progress(fs_info); - btrfs_dev_replace_read_unlock(dev_replace); + up_read(&dev_replace->rwsem); } int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) @@ -790,46 +797,74 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) return -EROFS; mutex_lock(&dev_replace->lock_finishing_cancel_unmount); - btrfs_dev_replace_write_lock(dev_replace); + down_write(&dev_replace->rwsem); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; - btrfs_dev_replace_write_unlock(dev_replace); - goto leave; + up_write(&dev_replace->rwsem); + break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + tgt_device = dev_replace->tgtdev; + src_device = dev_replace->srcdev; + up_write(&dev_replace->rwsem); + ret = btrfs_scrub_cancel(fs_info); + if (ret < 0) { + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; + } else { + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + /* + * btrfs_dev_replace_finishing() will handle the + * cleanup part + */ + btrfs_info_in_rcu(fs_info, + "dev_replace from %s (devid %llu) to %s canceled", + btrfs_dev_name(src_device), src_device->devid, + btrfs_dev_name(tgt_device)); + } + break; case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + /* + * Scrub doing the replace isn't running so we need to do the + * cleanup step of btrfs_dev_replace_finishing() here + */ result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; tgt_device = dev_replace->tgtdev; src_device = dev_replace->srcdev; dev_replace->tgtdev = NULL; dev_replace->srcdev = NULL; - break; - } - dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; - dev_replace->time_stopped = ktime_get_real_seconds(); - dev_replace->item_needs_writeback = 1; - btrfs_dev_replace_write_unlock(dev_replace); - btrfs_scrub_cancel(fs_info); + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; + dev_replace->time_stopped = ktime_get_real_seconds(); + dev_replace->item_needs_writeback = 1; - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); - return PTR_ERR(trans); - } - ret = btrfs_commit_transaction(trans); - WARN_ON(ret); + up_write(&dev_replace->rwsem); - btrfs_info_in_rcu(fs_info, - "dev_replace from %s (devid %llu) to %s canceled", - btrfs_dev_name(src_device), src_device->devid, - btrfs_dev_name(tgt_device)); + /* Scrub for replace must not be running in suspended state */ + ret = btrfs_scrub_cancel(fs_info); + ASSERT(ret != -ENOTCONN); + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans); + WARN_ON(ret); - if (tgt_device) - btrfs_destroy_dev_replace_tgtdev(tgt_device); + btrfs_info_in_rcu(fs_info, + "suspended dev_replace from %s (devid %llu) to %s canceled", + btrfs_dev_name(src_device), src_device->devid, + btrfs_dev_name(tgt_device)); + + if (tgt_device) + btrfs_destroy_dev_replace_tgtdev(tgt_device); + break; + default: + result = -EINVAL; + } -leave: mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return result; } @@ -839,7 +874,8 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; mutex_lock(&dev_replace->lock_finishing_cancel_unmount); - btrfs_dev_replace_write_lock(dev_replace); + down_write(&dev_replace->rwsem); + switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: @@ -855,7 +891,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) break; } - btrfs_dev_replace_write_unlock(dev_replace); + up_write(&dev_replace->rwsem); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); } @@ -865,12 +901,13 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) struct task_struct *task; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - btrfs_dev_replace_write_lock(dev_replace); + down_write(&dev_replace->rwsem); + switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: - btrfs_dev_replace_write_unlock(dev_replace); + up_write(&dev_replace->rwsem); return 0; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: break; @@ -884,10 +921,12 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) "cannot continue dev_replace, tgtdev is missing"); btrfs_info(fs_info, "you may cancel the operation after 'mount -o degraded'"); - btrfs_dev_replace_write_unlock(dev_replace); + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; + up_write(&dev_replace->rwsem); return 0; } - btrfs_dev_replace_write_unlock(dev_replace); + up_write(&dev_replace->rwsem); /* * This could collide with a paused balance, but the exclusive op logic @@ -895,6 +934,10 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) * dev-replace to start anyway. */ if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + down_write(&dev_replace->rwsem); + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; + up_write(&dev_replace->rwsem); btrfs_info(fs_info, "cannot resume dev-replace, other exclusive operation running"); return 0; @@ -925,7 +968,7 @@ static int btrfs_dev_replace_kthread(void *data) btrfs_device_get_total_bytes(dev_replace->srcdev), &dev_replace->scrub_progress, 0, 1); ret = btrfs_dev_replace_finishing(fs_info, ret); - WARN_ON(ret); + WARN_ON(ret && ret != -ECANCELED); clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); return 0; @@ -948,7 +991,7 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) * something that can happen if the dev_replace * procedure is suspended by an umount and then * the tgtdev is missing (or "btrfs dev scan") was - * not called and the the filesystem is remounted + * not called and the filesystem is remounted * in degraded state. This does not stop the * dev_replace procedure. It needs to be canceled * manually if the cancellation is wanted. @@ -958,42 +1001,6 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) return 1; } -void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace) -{ - read_lock(&dev_replace->lock); -} - -void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace) -{ - read_unlock(&dev_replace->lock); -} - -void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace) -{ -again: - wait_event(dev_replace->read_lock_wq, - atomic_read(&dev_replace->blocking_readers) == 0); - write_lock(&dev_replace->lock); - if (atomic_read(&dev_replace->blocking_readers)) { - write_unlock(&dev_replace->lock); - goto again; - } -} - -void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace) -{ - write_unlock(&dev_replace->lock); -} - -/* inc blocking cnt and release read lock */ -void btrfs_dev_replace_set_lock_blocking( - struct btrfs_dev_replace *dev_replace) -{ - /* only set blocking for read lock */ - atomic_inc(&dev_replace->blocking_readers); - read_unlock(&dev_replace->lock); -} - void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) { percpu_counter_inc(&fs_info->dev_replace.bio_counter); diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 795c551f5b5e..4aa40bacc6cc 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -13,19 +13,11 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); -int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, - const char *tgtdev_name, u64 srcdevid, const char *srcdev_name, - int read_src); void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace); #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b0ab41da91d1..8da2f380d3c0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -279,6 +279,12 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, len = buf->len - offset; while (len > 0) { + /* + * Note: we don't need to check for the err == 1 case here, as + * with the given combination of 'start = BTRFS_CSUM_SIZE (32)' + * and 'min_len = 32' and the currently implemented mapping + * algorithm we cannot cross a page boundary. + */ err = map_private_extent_buffer(buf, offset, 32, &kaddr, &map_start, &map_len); if (err) @@ -477,9 +483,9 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, int mirror_num = 0; int failed_mirror = 0; - clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; while (1) { + clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE, mirror_num); if (!ret) { @@ -493,15 +499,6 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, break; } - /* - * This buffer's crc is fine, but its contents are corrupted, so - * there is no reason to read the other copies, they won't be - * any less wrong. - */ - if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags) || - ret == -EUCLEAN) - break; - num_copies = btrfs_num_copies(fs_info, eb->start, eb->len); if (num_copies == 1) @@ -551,7 +548,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) if (WARN_ON(!PageUptodate(page))) return -EUCLEAN; - ASSERT(memcmp_extent_buffer(eb, fs_info->fsid, + ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0); return csum_tree_block(fs_info, eb, 0); @@ -566,7 +563,20 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); while (fs_devices) { - if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) { + u8 *metadata_uuid; + + /* + * Checking the incompat flag is only valid for the current + * fs. For seed devices it's forbidden to have their uuid + * changed so reading ->fsid in this case is fine + */ + if (fs_devices == fs_info->fs_devices && + btrfs_fs_incompat(fs_info, METADATA_UUID)) + metadata_uuid = fs_devices->metadata_uuid; + else + metadata_uuid = fs_devices->fsid; + + if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) { ret = 0; break; } @@ -669,19 +679,6 @@ out: return ret; } -static int btree_io_failed_hook(struct page *page, int failed_mirror) -{ - struct extent_buffer *eb; - - eb = (struct extent_buffer *)page->private; - set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); - eb->read_mirror = failed_mirror; - atomic_dec(&eb->io_pages); - if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(eb, -EIO); - return -EIO; /* we fixed nothing */ -} - static void end_workqueue_bio(struct bio *bio) { struct btrfs_end_io_wq *end_io_wq = bio->bi_private; @@ -760,11 +757,22 @@ static void run_one_async_start(struct btrfs_work *work) async->status = ret; } +/* + * In order to insert checksums into the metadata in large chunks, we wait + * until bio submission time. All the pages in the bio are checksummed and + * sums are attached onto the ordered extent record. + * + * At IO completion time the csums attached on the ordered extent record are + * inserted into the tree. + */ static void run_one_async_done(struct btrfs_work *work) { struct async_submit_bio *async; + struct inode *inode; + blk_status_t ret; async = container_of(work, struct async_submit_bio, work); + inode = async->private_data; /* If an error occurred we just want to clean up the bio and move on */ if (async->status) { @@ -773,7 +781,12 @@ static void run_one_async_done(struct btrfs_work *work) return; } - btrfs_submit_bio_done(async->private_data, async->bio, async->mirror_num); + ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, + async->mirror_num, 1); + if (ret) { + async->bio->bi_status = ret; + bio_endio(async->bio); + } } static void run_one_async_free(struct btrfs_work *work) @@ -1187,6 +1200,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, refcount_set(&root->refs, 1); atomic_set(&root->will_be_snapshotted, 0); atomic_set(&root->snapshot_force_cow, 0); + atomic_set(&root->nr_swapfiles, 0); root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; @@ -1664,9 +1678,8 @@ static int cleaner_kthread(void *arg) struct btrfs_root *root = arg; struct btrfs_fs_info *fs_info = root->fs_info; int again; - struct btrfs_trans_handle *trans; - do { + while (1) { again = 0; /* Make the cleaner go to sleep early. */ @@ -1715,42 +1728,16 @@ static int cleaner_kthread(void *arg) */ btrfs_delete_unused_bgs(fs_info); sleep: + if (kthread_should_park()) + kthread_parkme(); + if (kthread_should_stop()) + return 0; if (!again) { set_current_state(TASK_INTERRUPTIBLE); - if (!kthread_should_stop()) - schedule(); + schedule(); __set_current_state(TASK_RUNNING); } - } while (!kthread_should_stop()); - - /* - * Transaction kthread is stopped before us and wakes us up. - * However we might have started a new transaction and COWed some - * tree blocks when deleting unused block groups for example. So - * make sure we commit the transaction we started to have a clean - * shutdown when evicting the btree inode - if it has dirty pages - * when we do the final iput() on it, eviction will trigger a - * writeback for it which will fail with null pointer dereferences - * since work queues and other resources were already released and - * destroyed by the time the iput/eviction/writeback is made. - */ - trans = btrfs_attach_transaction(root); - if (IS_ERR(trans)) { - if (PTR_ERR(trans) != -ENOENT) - btrfs_err(fs_info, - "cleaner transaction attach returned %ld", - PTR_ERR(trans)); - } else { - int ret; - - ret = btrfs_commit_transaction(trans); - if (ret) - btrfs_err(fs_info, - "cleaner open transaction commit returned %d", - ret); } - - return 0; } static int transaction_kthread(void *arg) @@ -2154,10 +2141,8 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info) { mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); - rwlock_init(&fs_info->dev_replace.lock); - atomic_set(&fs_info->dev_replace.blocking_readers, 0); + init_rwsem(&fs_info->dev_replace.rwsem); init_waitqueue_head(&fs_info->dev_replace.replace_wait); - init_waitqueue_head(&fs_info->dev_replace.read_lock_wq); } static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) @@ -2478,10 +2463,11 @@ static int validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } - if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) { + if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid, + BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, - "dev_item UUID does not match fsid: %pU != %pU", - fs_info->fsid, sb->dev_item.fsid); + "dev_item UUID does not match metadata fsid: %pU != %pU", + fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid); ret = -EINVAL; } @@ -2692,6 +2678,9 @@ int open_ctree(struct super_block *sb, btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); btrfs_init_block_rsv(&fs_info->delayed_block_rsv, BTRFS_BLOCK_RSV_DELOPS); + btrfs_init_block_rsv(&fs_info->delayed_refs_rsv, + BTRFS_BLOCK_RSV_DELREFS); + atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->defrag_running, 0); atomic_set(&fs_info->qgroup_op_seq, 0); @@ -2781,6 +2770,9 @@ int open_ctree(struct super_block *sb, fs_info->sectorsize = 4096; fs_info->stripesize = 4096; + spin_lock_init(&fs_info->swapfile_pins_lock); + fs_info->swapfile_pins = RB_ROOT; + ret = btrfs_alloc_stripe_hash_table(fs_info); if (ret) { err = ret; @@ -2817,11 +2809,29 @@ int open_ctree(struct super_block *sb, * the whole block of INFO_SIZE */ memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy)); - memcpy(fs_info->super_for_commit, fs_info->super_copy, - sizeof(*fs_info->super_for_commit)); brelse(bh); - memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE); + disk_super = fs_info->super_copy; + + ASSERT(!memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid, + BTRFS_FSID_SIZE)); + + if (btrfs_fs_incompat(fs_info, METADATA_UUID)) { + ASSERT(!memcmp(fs_info->fs_devices->metadata_uuid, + fs_info->super_copy->metadata_uuid, + BTRFS_FSID_SIZE)); + } + + features = btrfs_super_flags(disk_super); + if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) { + features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2; + btrfs_set_super_flags(disk_super, features); + btrfs_info(fs_info, + "found metadata UUID change in progress flag, clearing"); + } + + memcpy(fs_info->super_for_commit, fs_info->super_copy, + sizeof(*fs_info->super_for_commit)); ret = btrfs_validate_mount_super(fs_info); if (ret) { @@ -2830,7 +2840,6 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } - disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) goto fail_alloc; @@ -2942,7 +2951,7 @@ int open_ctree(struct super_block *sb, sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); - memcpy(&sb->s_uuid, fs_info->fsid, BTRFS_FSID_SIZE); + memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE); mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_sys_array(fs_info); @@ -3091,7 +3100,7 @@ retry_root_backup: if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, - "writeable mount is not allowed due to too many missing devices"); + "writable mount is not allowed due to too many missing devices"); goto fail_sysfs; } @@ -3760,7 +3769,8 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) btrfs_set_stack_device_io_width(dev_item, dev->io_width); btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE); - memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_FSID_SIZE); + memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid, + BTRFS_FSID_SIZE); flags = btrfs_super_flags(sb); btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); @@ -3931,6 +3941,13 @@ void close_ctree(struct btrfs_fs_info *fs_info) int ret; set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); + /* + * We don't want the cleaner to start new transactions, add more delayed + * iputs, etc. while we're closing. We can't use kthread_stop() yet + * because that frees the task_struct, and the transaction kthread might + * still try to wake up the cleaner. + */ + kthread_park(fs_info->cleaner_kthread); /* wait for the qgroup rescan worker to stop */ btrfs_qgroup_wait_for_completion(fs_info, false); @@ -3958,9 +3975,8 @@ void close_ctree(struct btrfs_fs_info *fs_info) if (!sb_rdonly(fs_info->sb)) { /* - * If the cleaner thread is stopped and there are - * block groups queued for removal, the deletion will be - * skipped when we quit the cleaner thread. + * The cleaner kthread is stopped, so do one final pass over + * unused block groups. */ btrfs_delete_unused_bgs(fs_info); @@ -4061,7 +4077,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* * This is a fast path so only do this check if we have sanity tests - * enabled. Normal people shouldn't be using umapped buffers as dirty + * enabled. Normal people shouldn't be using unmapped buffers as dirty * outside of the sanity tests. */ if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags))) @@ -4359,13 +4375,26 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, unpin = pinned_extents; again: while (1) { + struct extent_state *cached_state = NULL; + + /* + * The btrfs_finish_extent_commit() may get the same range as + * ours between find_first_extent_bit and clear_extent_dirty. + * Hence, hold the unused_bg_unpin_mutex to avoid double unpin + * the same extent range. + */ + mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, NULL); - if (ret) + EXTENT_DIRTY, &cached_state); + if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; + } - clear_extent_dirty(unpin, start, end); + clear_extent_dirty(unpin, start, end, &cached_state); + free_extent_state(cached_state); btrfs_error_unpin_extent_range(fs_info, start, end); + mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); } @@ -4420,6 +4449,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, spin_unlock(&cur_trans->dirty_bgs_lock); btrfs_put_block_group(cache); + btrfs_delayed_refs_rsv_release(fs_info, 1); spin_lock(&cur_trans->dirty_bgs_lock); } spin_unlock(&cur_trans->dirty_bgs_lock); @@ -4525,7 +4555,4 @@ static const struct extent_io_ops btree_extent_io_ops = { /* mandatory callbacks */ .submit_bio_hook = btree_submit_bio_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, - .readpage_io_failed_hook = btree_io_failed_hook, - - /* optional callbacks */ }; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 4cccba22640f..987a64bc0c66 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -21,11 +21,11 @@ #define BTRFS_BDEV_BLOCKSIZE (4096) enum btrfs_wq_endio_type { - BTRFS_WQ_ENDIO_DATA = 0, - BTRFS_WQ_ENDIO_METADATA = 1, - BTRFS_WQ_ENDIO_FREE_SPACE = 2, - BTRFS_WQ_ENDIO_RAID56 = 3, - BTRFS_WQ_ENDIO_DIO_REPAIR = 4, + BTRFS_WQ_ENDIO_DATA, + BTRFS_WQ_ENDIO_METADATA, + BTRFS_WQ_ENDIO_FREE_SPACE, + BTRFS_WQ_ENDIO_RAID56, + BTRFS_WQ_ENDIO_DIO_REPAIR, }; static inline u64 btrfs_sb_offset(int mirror) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a1febf155747..b15afeae16df 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -51,6 +51,24 @@ enum { CHUNK_ALLOC_FORCE = 2, }; +/* + * Declare a helper function to detect underflow of various space info members + */ +#define DECLARE_SPACE_INFO_UPDATE(name) \ +static inline void update_##name(struct btrfs_space_info *sinfo, \ + s64 bytes) \ +{ \ + if (bytes < 0 && sinfo->name < -bytes) { \ + WARN_ON(1); \ + sinfo->name = 0; \ + return; \ + } \ + sinfo->name += bytes; \ +} + +DECLARE_SPACE_INFO_UPDATE(bytes_may_use); +DECLARE_SPACE_INFO_UPDATE(bytes_pinned); + static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, @@ -1037,7 +1055,7 @@ out_free: /* * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required, - * is_data == BTRFS_REF_TYPE_DATA, data type is requried, + * is_data == BTRFS_REF_TYPE_DATA, data type is requiried, * is_data == BTRFS_REF_TYPE_ANY, either type is OK. */ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, @@ -2406,25 +2424,82 @@ static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_ref btrfs_delayed_ref_unlock(head); } -static int cleanup_extent_op(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head) +static struct btrfs_delayed_extent_op *cleanup_extent_op( + struct btrfs_delayed_ref_head *head) { struct btrfs_delayed_extent_op *extent_op = head->extent_op; - int ret; if (!extent_op) - return 0; - head->extent_op = NULL; + return NULL; + if (head->must_insert_reserved) { + head->extent_op = NULL; btrfs_free_delayed_extent_op(extent_op); - return 0; + return NULL; } + return extent_op; +} + +static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_extent_op *extent_op; + int ret; + + extent_op = cleanup_extent_op(head); + if (!extent_op) + return 0; + head->extent_op = NULL; spin_unlock(&head->lock); ret = run_delayed_extent_op(trans, head, extent_op); btrfs_free_delayed_extent_op(extent_op); return ret ? ret : 1; } +static void cleanup_ref_head_accounting(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + int nr_items = 1; /* Dropping this ref head update. */ + + if (head->total_ref_mod < 0) { + struct btrfs_space_info *space_info; + u64 flags; + + if (head->is_data) + flags = BTRFS_BLOCK_GROUP_DATA; + else if (head->is_system) + flags = BTRFS_BLOCK_GROUP_SYSTEM; + else + flags = BTRFS_BLOCK_GROUP_METADATA; + space_info = __find_space_info(fs_info, flags); + ASSERT(space_info); + percpu_counter_add_batch(&space_info->total_bytes_pinned, + -head->num_bytes, + BTRFS_TOTAL_BYTES_PINNED_BATCH); + + /* + * We had csum deletions accounted for in our delayed refs rsv, + * we need to drop the csum leaves for this update from our + * delayed_refs_rsv. + */ + if (head->is_data) { + spin_lock(&delayed_refs->lock); + delayed_refs->pending_csums -= head->num_bytes; + spin_unlock(&delayed_refs->lock); + nr_items += btrfs_csum_bytes_to_leaves(fs_info, + head->num_bytes); + } + } + + /* Also free its reserved qgroup space */ + btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, + head->qgroup_reserved); + btrfs_delayed_refs_rsv_release(fs_info, nr_items); +} + static int cleanup_ref_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head) { @@ -2435,7 +2510,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; - ret = cleanup_extent_op(trans, head); + ret = run_and_cleanup_extent_op(trans, head); if (ret < 0) { unselect_delayed_ref_head(delayed_refs, head); btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); @@ -2456,37 +2531,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); return 1; } - delayed_refs->num_heads--; - rb_erase_cached(&head->href_node, &delayed_refs->href_root); - RB_CLEAR_NODE(&head->href_node); + btrfs_delete_ref_head(delayed_refs, head); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); - atomic_dec(&delayed_refs->num_entries); - - trace_run_delayed_ref_head(fs_info, head, 0); - - if (head->total_ref_mod < 0) { - struct btrfs_space_info *space_info; - u64 flags; - - if (head->is_data) - flags = BTRFS_BLOCK_GROUP_DATA; - else if (head->is_system) - flags = BTRFS_BLOCK_GROUP_SYSTEM; - else - flags = BTRFS_BLOCK_GROUP_METADATA; - space_info = __find_space_info(fs_info, flags); - ASSERT(space_info); - percpu_counter_add_batch(&space_info->total_bytes_pinned, - -head->num_bytes, - BTRFS_TOTAL_BYTES_PINNED_BATCH); - - if (head->is_data) { - spin_lock(&delayed_refs->lock); - delayed_refs->pending_csums -= head->num_bytes; - spin_unlock(&delayed_refs->lock); - } - } if (head->must_insert_reserved) { btrfs_pin_extent(fs_info, head->bytenr, @@ -2497,9 +2544,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, } } - /* Also free its reserved qgroup space */ - btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, - head->qgroup_reserved); + cleanup_ref_head_accounting(trans, head); + + trace_run_delayed_ref_head(fs_info, head, 0); btrfs_delayed_ref_unlock(head); btrfs_put_delayed_ref_head(head); return 0; @@ -2792,40 +2839,28 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes) return num_csums; } -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans) +bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_rsv *global_rsv; - u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; - u64 csum_bytes = trans->transaction->delayed_refs.pending_csums; - unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs; - u64 num_bytes, num_dirty_bgs_bytes; - int ret = 0; + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + bool ret = false; + u64 reserved; - num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - num_heads = heads_to_leaves(fs_info, num_heads); - if (num_heads > 1) - num_bytes += (num_heads - 1) * fs_info->nodesize; - num_bytes <<= 1; - num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) * - fs_info->nodesize; - num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info, - num_dirty_bgs); - global_rsv = &fs_info->global_block_rsv; + spin_lock(&global_rsv->lock); + reserved = global_rsv->reserved; + spin_unlock(&global_rsv->lock); /* - * If we can't allocate any more chunks lets make sure we have _lots_ of - * wiggle room since running delayed refs can create more delayed refs. + * Since the global reserve is just kind of magic we don't really want + * to rely on it to save our bacon, so if our size is more than the + * delayed_refs_rsv and the global rsv then it's time to think about + * bailing. */ - if (global_rsv->space_info->full) { - num_dirty_bgs_bytes <<= 1; - num_bytes <<= 1; - } - - spin_lock(&global_rsv->lock); - if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes) - ret = 1; - spin_unlock(&global_rsv->lock); + spin_lock(&delayed_refs_rsv->lock); + reserved += delayed_refs_rsv->reserved; + if (delayed_refs_rsv->size >= reserved) + ret = true; + spin_unlock(&delayed_refs_rsv->lock); return ret; } @@ -2844,7 +2879,7 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) if (val >= NSEC_PER_SEC / 2) return 2; - return btrfs_check_space_for_delayed_refs(trans); + return btrfs_check_space_for_delayed_refs(trans->fs_info); } struct async_delayed_refs { @@ -3588,6 +3623,8 @@ again: */ mutex_lock(&trans->transaction->cache_write_mutex); while (!list_empty(&dirty)) { + bool drop_reserve = true; + cache = list_first_entry(&dirty, struct btrfs_block_group_cache, dirty_list); @@ -3660,6 +3697,7 @@ again: list_add_tail(&cache->dirty_list, &cur_trans->dirty_bgs); btrfs_get_block_group(cache); + drop_reserve = false; } spin_unlock(&cur_trans->dirty_bgs_lock); } else if (ret) { @@ -3667,9 +3705,11 @@ again: } } - /* if its not on the io list, we need to put the block group */ + /* if it's not on the io list, we need to put the block group */ if (should_put) btrfs_put_block_group(cache); + if (drop_reserve) + btrfs_delayed_refs_rsv_release(fs_info, 1); if (ret) break; @@ -3818,6 +3858,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, /* if its not on the io list, we need to put the block group */ if (should_put) btrfs_put_block_group(cache); + btrfs_delayed_refs_rsv_release(fs_info, 1); spin_lock(&cur_trans->dirty_bgs_lock); } spin_unlock(&cur_trans->dirty_bgs_lock); @@ -4256,7 +4297,7 @@ commit_trans: data_sinfo->flags, bytes, 1); return -ENOSPC; } - data_sinfo->bytes_may_use += bytes; + update_bytes_may_use(data_sinfo, bytes); trace_btrfs_space_reservation(fs_info, "space_info", data_sinfo->flags, bytes, 1); spin_unlock(&data_sinfo->lock); @@ -4309,10 +4350,7 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, data_sinfo = fs_info->data_sinfo; spin_lock(&data_sinfo->lock); - if (WARN_ON(data_sinfo->bytes_may_use < len)) - data_sinfo->bytes_may_use = 0; - else - data_sinfo->bytes_may_use -= len; + update_bytes_may_use(data_sinfo, -len); trace_btrfs_space_reservation(fs_info, "space_info", data_sinfo->flags, len, 0); spin_unlock(&data_sinfo->lock); @@ -4637,7 +4675,7 @@ static int can_overcommit(struct btrfs_fs_info *fs_info, /* * If we have dup, raid1 or raid10 then only half of the free - * space is actually useable. For raid56, the space info used + * space is actually usable. For raid56, the space info used * doesn't include the parity drive, so we don't have to * change the math */ @@ -4793,8 +4831,10 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, { struct reserve_ticket *ticket = NULL; struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; struct btrfs_trans_handle *trans; - u64 bytes; + u64 bytes_needed; + u64 reclaim_bytes = 0; trans = (struct btrfs_trans_handle *)current->journal_info; if (trans) @@ -4807,15 +4847,15 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, else if (!list_empty(&space_info->tickets)) ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); - bytes = (ticket) ? ticket->bytes : 0; + bytes_needed = (ticket) ? ticket->bytes : 0; spin_unlock(&space_info->lock); - if (!bytes) + if (!bytes_needed) return 0; /* See if there is enough pinned space to make this reservation */ if (__percpu_counter_compare(&space_info->total_bytes_pinned, - bytes, + bytes_needed, BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) goto commit; @@ -4827,14 +4867,18 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, return -ENOSPC; spin_lock(&delayed_rsv->lock); - if (delayed_rsv->size > bytes) - bytes = 0; - else - bytes -= delayed_rsv->size; + reclaim_bytes += delayed_rsv->reserved; spin_unlock(&delayed_rsv->lock); + spin_lock(&delayed_refs_rsv->lock); + reclaim_bytes += delayed_refs_rsv->reserved; + spin_unlock(&delayed_refs_rsv->lock); + if (reclaim_bytes >= bytes_needed) + goto commit; + bytes_needed -= reclaim_bytes; + if (__percpu_counter_compare(&space_info->total_bytes_pinned, - bytes, + bytes_needed, BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) { return -ENOSPC; } @@ -4882,6 +4926,20 @@ static void flush_space(struct btrfs_fs_info *fs_info, shrink_delalloc(fs_info, num_bytes * 2, num_bytes, state == FLUSH_DELALLOC_WAIT); break; + case FLUSH_DELAYED_REFS_NR: + case FLUSH_DELAYED_REFS: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + if (state == FLUSH_DELAYED_REFS_NR) + nr = calc_reclaim_items_nr(fs_info, num_bytes); + else + nr = 0; + btrfs_run_delayed_refs(trans, nr); + btrfs_end_transaction(trans); + break; case ALLOC_CHUNK: trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { @@ -5108,7 +5166,7 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, list_del_init(&ticket->list); if (ticket->bytes && ticket->bytes < orig_bytes) { u64 num_bytes = orig_bytes - ticket->bytes; - space_info->bytes_may_use -= num_bytes; + update_bytes_may_use(space_info, -num_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, num_bytes, 0); } @@ -5154,13 +5212,13 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, * If not things get more complicated. */ if (used + orig_bytes <= space_info->total_bytes) { - space_info->bytes_may_use += orig_bytes; + update_bytes_may_use(space_info, orig_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, orig_bytes, 1); ret = 0; } else if (can_overcommit(fs_info, space_info, orig_bytes, flush, system_chunk)) { - space_info->bytes_may_use += orig_bytes; + update_bytes_may_use(space_info, orig_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, orig_bytes, 1); ret = 0; @@ -5223,7 +5281,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, if (ticket.bytes) { if (ticket.bytes < orig_bytes) { u64 num_bytes = orig_bytes - ticket.bytes; - space_info->bytes_may_use -= num_bytes; + update_bytes_may_use(space_info, -num_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, num_bytes, 0); @@ -5244,7 +5302,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, * @orig_bytes - the number of bytes we want * @flush - whether or not we can flush to make our reservation * - * This will reserve orgi_bytes number of bytes from the space info associated + * This will reserve orig_bytes number of bytes from the space info associated * with the block_rsv. If there is not enough space it will make an attempt to * flush out space to make room. It will do this by flushing delalloc if * possible or committing the transaction. If flush is 0 then no attempts to @@ -5354,6 +5412,90 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, return 0; } +/** + * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv. + * @fs_info - the fs info for our fs. + * @src - the source block rsv to transfer from. + * @num_bytes - the number of bytes to transfer. + * + * This transfers up to the num_bytes amount from the src rsv to the + * delayed_refs_rsv. Any extra bytes are returned to the space info. + */ +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *src, + u64 num_bytes) +{ + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + u64 to_free = 0; + + spin_lock(&src->lock); + src->reserved -= num_bytes; + src->size -= num_bytes; + spin_unlock(&src->lock); + + spin_lock(&delayed_refs_rsv->lock); + if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { + u64 delta = delayed_refs_rsv->size - + delayed_refs_rsv->reserved; + if (num_bytes > delta) { + to_free = num_bytes - delta; + num_bytes = delta; + } + } else { + to_free = num_bytes; + num_bytes = 0; + } + + if (num_bytes) + delayed_refs_rsv->reserved += num_bytes; + if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) + delayed_refs_rsv->full = 1; + spin_unlock(&delayed_refs_rsv->lock); + + if (num_bytes) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, num_bytes, 1); + if (to_free) + space_info_add_old_bytes(fs_info, delayed_refs_rsv->space_info, + to_free); +} + +/** + * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage. + * @fs_info - the fs_info for our fs. + * @flush - control how we can flush for this reservation. + * + * This will refill the delayed block_rsv up to 1 items size worth of space and + * will return -ENOSPC if we can't make the reservation. + */ +int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; + u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1); + u64 num_bytes = 0; + int ret = -ENOSPC; + + spin_lock(&block_rsv->lock); + if (block_rsv->reserved < block_rsv->size) { + num_bytes = block_rsv->size - block_rsv->reserved; + num_bytes = min(num_bytes, limit); + } + spin_unlock(&block_rsv->lock); + + if (!num_bytes) + return 0; + + ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv, + num_bytes, flush); + if (ret) + return ret; + block_rsv_add_bytes(block_rsv, num_bytes, 0); + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, num_bytes, 1); + return 0; +} + /* * This is for space we already have accounted in space_info->bytes_may_use, so * basically when we're returning space from block_rsv's. @@ -5407,7 +5549,7 @@ again: flush = BTRFS_RESERVE_FLUSH_ALL; goto again; } - space_info->bytes_may_use -= num_bytes; + update_bytes_may_use(space_info, -num_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, num_bytes, 0); spin_unlock(&space_info->lock); @@ -5435,7 +5577,7 @@ again: ticket->bytes, 1); list_del_init(&ticket->list); num_bytes -= ticket->bytes; - space_info->bytes_may_use += ticket->bytes; + update_bytes_may_use(space_info, ticket->bytes); ticket->bytes = 0; space_info->tickets_id++; wake_up(&ticket->wait); @@ -5443,7 +5585,7 @@ again: trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, num_bytes, 1); - space_info->bytes_may_use += num_bytes; + update_bytes_may_use(space_info, num_bytes); ticket->bytes -= num_bytes; num_bytes = 0; } @@ -5629,11 +5771,11 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, /** * btrfs_inode_rsv_refill - refill the inode block rsv. * @inode - the inode we are refilling. - * @flush - the flusing restriction. + * @flush - the flushing restriction. * * Essentially the same as btrfs_block_rsv_refill, except it uses the * block_rsv->size as the minimum size. We'll either refill the missing amount - * or return if we already have enough space. This will also handle the resreve + * or return if we already have enough space. This will also handle the reserve * tracepoint for the reserved amount. */ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, @@ -5674,6 +5816,31 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, return ret; } +static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, u64 *qgroup_to_release) +{ + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_block_rsv *target = delayed_rsv; + + if (target->full || target == block_rsv) + target = global_rsv; + + if (block_rsv->space_info != target->space_info) + target = NULL; + + return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes, + qgroup_to_release); +} + +void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); +} + /** * btrfs_inode_rsv_release - release any excessive reservation. * @inode - the inode we need to release from. @@ -5688,7 +5855,6 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; struct btrfs_block_rsv *block_rsv = &inode->block_rsv; u64 released = 0; u64 qgroup_to_release = 0; @@ -5698,8 +5864,8 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) * are releasing 0 bytes, and then we'll just get the reservation over * the size free'd. */ - released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0, - &qgroup_to_release); + released = __btrfs_block_rsv_release(fs_info, block_rsv, 0, + &qgroup_to_release); if (released > 0) trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), released, 0); @@ -5710,16 +5876,26 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) qgroup_to_release); } -void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) +/** + * btrfs_delayed_refs_rsv_release - release a ref head's reservation. + * @fs_info - the fs_info for our fs. + * @nr - the number of items to drop. + * + * This drops the delayed ref head's count from the delayed refs rsv and frees + * any excess reservation we had. + */ +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) { + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr); + u64 released = 0; - if (global_rsv == block_rsv || - block_rsv->space_info != global_rsv->space_info) - global_rsv = NULL; - block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL); + released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, + num_bytes, NULL); + if (released) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, released, 0); } static void update_global_block_rsv(struct btrfs_fs_info *fs_info) @@ -5750,14 +5926,14 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) num_bytes = min(num_bytes, block_rsv->size - block_rsv->reserved); block_rsv->reserved += num_bytes; - sinfo->bytes_may_use += num_bytes; + update_bytes_may_use(sinfo, num_bytes); trace_btrfs_space_reservation(fs_info, "space_info", sinfo->flags, num_bytes, 1); } } else if (block_rsv->reserved > block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; - sinfo->bytes_may_use -= num_bytes; + update_bytes_may_use(sinfo, -num_bytes); trace_btrfs_space_reservation(fs_info, "space_info", sinfo->flags, num_bytes, 0); block_rsv->reserved = block_rsv->size; @@ -5784,9 +5960,10 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->trans_block_rsv.space_info = space_info; fs_info->empty_block_rsv.space_info = space_info; fs_info->delayed_block_rsv.space_info = space_info; + fs_info->delayed_refs_rsv.space_info = space_info; - fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; - fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; + fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv; + fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv; fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; if (fs_info->quota_root) @@ -5806,8 +5983,34 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) WARN_ON(fs_info->chunk_block_rsv.reserved > 0); WARN_ON(fs_info->delayed_block_rsv.size > 0); WARN_ON(fs_info->delayed_block_rsv.reserved > 0); + WARN_ON(fs_info->delayed_refs_rsv.reserved > 0); + WARN_ON(fs_info->delayed_refs_rsv.size > 0); } +/* + * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv + * @trans - the trans that may have generated delayed refs + * + * This is to be called anytime we may have adjusted trans->delayed_ref_updates, + * it'll calculate the additional size and add it to the delayed_refs_rsv. + */ +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + u64 num_bytes; + + if (!trans->delayed_ref_updates) + return; + + num_bytes = btrfs_calc_trans_metadata_size(fs_info, + trans->delayed_ref_updates); + spin_lock(&delayed_rsv->lock); + delayed_rsv->size += num_bytes; + delayed_rsv->full = 0; + spin_unlock(&delayed_rsv->lock); + trans->delayed_ref_updates = 0; +} /* * To be called after all the new block groups attached to the transaction @@ -6100,6 +6303,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, u64 old_val; u64 byte_in_group; int factor; + int ret = 0; /* block accounting for super block */ spin_lock(&info->delalloc_root_lock); @@ -6113,8 +6317,10 @@ static int update_block_group(struct btrfs_trans_handle *trans, while (total) { cache = btrfs_lookup_block_group(info, bytenr); - if (!cache) - return -ENOENT; + if (!cache) { + ret = -ENOENT; + break; + } factor = btrfs_bg_type_to_factor(cache->flags); /* @@ -6151,7 +6357,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, old_val -= num_bytes; btrfs_set_block_group_used(&cache->item, old_val); cache->pinned += num_bytes; - cache->space_info->bytes_pinned += num_bytes; + update_bytes_pinned(cache->space_info, num_bytes); cache->space_info->bytes_used -= num_bytes; cache->space_info->disk_used -= num_bytes * factor; spin_unlock(&cache->lock); @@ -6173,6 +6379,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, list_add_tail(&cache->dirty_list, &trans->transaction->dirty_bgs); trans->transaction->num_dirty_bgs++; + trans->delayed_ref_updates++; btrfs_get_block_group(cache); } spin_unlock(&trans->transaction->dirty_bgs_lock); @@ -6190,7 +6397,10 @@ static int update_block_group(struct btrfs_trans_handle *trans, total -= num_bytes; bytenr += num_bytes; } - return 0; + + /* Modified block groups are accounted for in the delayed_refs_rsv. */ + btrfs_update_delayed_refs_rsv(trans); + return ret; } static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) @@ -6222,7 +6432,7 @@ static int pin_down_extent(struct btrfs_fs_info *fs_info, spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned += num_bytes; - cache->space_info->bytes_pinned += num_bytes; + update_bytes_pinned(cache->space_info, num_bytes); if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; @@ -6431,7 +6641,7 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, } else { cache->reserved += num_bytes; space_info->bytes_reserved += num_bytes; - space_info->bytes_may_use -= ram_bytes; + update_bytes_may_use(space_info, -ram_bytes); if (delalloc) cache->delalloc_bytes += num_bytes; } @@ -6587,7 +6797,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; - space_info->bytes_pinned -= len; + update_bytes_pinned(space_info, -len); trace_btrfs_space_reservation(fs_info, "pinned", space_info->flags, len, 0); @@ -6608,7 +6818,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, to_add = min(len, global_rsv->size - global_rsv->reserved); global_rsv->reserved += to_add; - space_info->bytes_may_use += to_add; + update_bytes_may_use(space_info, to_add); if (global_rsv->reserved >= global_rsv->size) global_rsv->full = 1; trace_btrfs_space_reservation(fs_info, @@ -6647,9 +6857,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) unpin = &fs_info->freed_extents[0]; while (!trans->aborted) { + struct extent_state *cached_state = NULL; + mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, NULL); + EXTENT_DIRTY, &cached_state); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; @@ -6659,9 +6871,10 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) ret = btrfs_discard_extent(fs_info, start, end + 1 - start, NULL); - clear_extent_dirty(unpin, start, end); + clear_extent_dirty(unpin, start, end, &cached_state); unpin_extent_range(fs_info, start, end, true); mutex_unlock(&fs_info->unused_bg_unpin_mutex); + free_extent_state(cached_state); cond_resched(); } @@ -6955,12 +7168,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root)) goto out; - if (head->extent_op) { - if (!head->must_insert_reserved) - goto out; - btrfs_free_delayed_extent_op(head->extent_op); - head->extent_op = NULL; - } + if (cleanup_extent_op(head) != NULL) + goto out; /* * waiting for the lock here would deadlock. If someone else has it @@ -6969,22 +7178,9 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (!mutex_trylock(&head->mutex)) goto out; - /* - * at this point we have a head with no other entries. Go - * ahead and process it. - */ - rb_erase_cached(&head->href_node, &delayed_refs->href_root); - RB_CLEAR_NODE(&head->href_node); - atomic_dec(&delayed_refs->num_entries); - - /* - * we don't take a ref on the node because we're removing it from the - * tree, so we just steal the ref the tree was holding. - */ - delayed_refs->num_heads--; - if (head->processing == 0) - delayed_refs->num_heads_ready--; + btrfs_delete_ref_head(delayed_refs, head); head->processing = 0; + spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); @@ -6992,6 +7188,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (head->must_insert_reserved) ret = 1; + cleanup_ref_head_accounting(trans, head); mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); return ret; @@ -7239,6 +7436,345 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache, } /* + * Structure used internally for find_free_extent() function. Wraps needed + * parameters. + */ +struct find_free_extent_ctl { + /* Basic allocation info */ + u64 ram_bytes; + u64 num_bytes; + u64 empty_size; + u64 flags; + int delalloc; + + /* Where to start the search inside the bg */ + u64 search_start; + + /* For clustered allocation */ + u64 empty_cluster; + + bool have_caching_bg; + bool orig_have_caching_bg; + + /* RAID index, converted from flags */ + int index; + + /* + * Current loop number, check find_free_extent_update_loop() for details + */ + int loop; + + /* + * Whether we're refilling a cluster, if true we need to re-search + * current block group but don't try to refill the cluster again. + */ + bool retry_clustered; + + /* + * Whether we're updating free space cache, if true we need to re-search + * current block group but don't try updating free space cache again. + */ + bool retry_unclustered; + + /* If current block group is cached */ + int cached; + + /* Max contiguous hole found */ + u64 max_extent_size; + + /* Total free space from free space cache, not always contiguous */ + u64 total_free_space; + + /* Found result */ + u64 found_offset; +}; + + +/* + * Helper function for find_free_extent(). + * + * Return -ENOENT to inform caller that we need fallback to unclustered mode. + * Return -EAGAIN to inform caller that we need to re-search this block group + * Return >0 to inform caller that we find nothing + * Return 0 means we have found a location and set ffe_ctl->found_offset. + */ +static int find_free_extent_clustered(struct btrfs_block_group_cache *bg, + struct btrfs_free_cluster *last_ptr, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group_cache **cluster_bg_ret) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + struct btrfs_block_group_cache *cluster_bg; + u64 aligned_cluster; + u64 offset; + int ret; + + cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc); + if (!cluster_bg) + goto refill_cluster; + if (cluster_bg != bg && (cluster_bg->ro || + !block_group_bits(cluster_bg, ffe_ctl->flags))) + goto release_cluster; + + offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr, + ffe_ctl->num_bytes, cluster_bg->key.objectid, + &ffe_ctl->max_extent_size); + if (offset) { + /* We have a block, we're done */ + spin_unlock(&last_ptr->refill_lock); + trace_btrfs_reserve_extent_cluster(cluster_bg, + ffe_ctl->search_start, ffe_ctl->num_bytes); + *cluster_bg_ret = cluster_bg; + ffe_ctl->found_offset = offset; + return 0; + } + WARN_ON(last_ptr->block_group != cluster_bg); + +release_cluster: + /* + * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so + * lets just skip it and let the allocator find whatever block it can + * find. If we reach this point, we will have tried the cluster + * allocator plenty of times and not have found anything, so we are + * likely way too fragmented for the clustering stuff to find anything. + * + * However, if the cluster is taken from the current block group, + * release the cluster first, so that we stand a better chance of + * succeeding in the unclustered allocation. + */ + if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) { + spin_unlock(&last_ptr->refill_lock); + btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); + return -ENOENT; + } + + /* This cluster didn't work out, free it and start over */ + btrfs_return_cluster_to_free_space(NULL, last_ptr); + + if (cluster_bg != bg) + btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); + +refill_cluster: + if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) { + spin_unlock(&last_ptr->refill_lock); + return -ENOENT; + } + + aligned_cluster = max_t(u64, + ffe_ctl->empty_cluster + ffe_ctl->empty_size, + bg->full_stripe_len); + ret = btrfs_find_space_cluster(fs_info, bg, last_ptr, + ffe_ctl->search_start, ffe_ctl->num_bytes, + aligned_cluster); + if (ret == 0) { + /* Now pull our allocation out of this cluster */ + offset = btrfs_alloc_from_cluster(bg, last_ptr, + ffe_ctl->num_bytes, ffe_ctl->search_start, + &ffe_ctl->max_extent_size); + if (offset) { + /* We found one, proceed */ + spin_unlock(&last_ptr->refill_lock); + trace_btrfs_reserve_extent_cluster(bg, + ffe_ctl->search_start, + ffe_ctl->num_bytes); + ffe_ctl->found_offset = offset; + return 0; + } + } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && + !ffe_ctl->retry_clustered) { + spin_unlock(&last_ptr->refill_lock); + + ffe_ctl->retry_clustered = true; + wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + + ffe_ctl->empty_cluster + ffe_ctl->empty_size); + return -EAGAIN; + } + /* + * At this point we either didn't find a cluster or we weren't able to + * allocate a block from our cluster. Free the cluster we've been + * trying to use, and go to the next block group. + */ + btrfs_return_cluster_to_free_space(NULL, last_ptr); + spin_unlock(&last_ptr->refill_lock); + return 1; +} + +/* + * Return >0 to inform caller that we find nothing + * Return 0 when we found an free extent and set ffe_ctrl->found_offset + * Return -EAGAIN to inform caller that we need to re-search this block group + */ +static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg, + struct btrfs_free_cluster *last_ptr, + struct find_free_extent_ctl *ffe_ctl) +{ + u64 offset; + + /* + * We are doing an unclustered allocation, set the fragmented flag so + * we don't bother trying to setup a cluster again until we get more + * space. + */ + if (unlikely(last_ptr)) { + spin_lock(&last_ptr->lock); + last_ptr->fragmented = 1; + spin_unlock(&last_ptr->lock); + } + if (ffe_ctl->cached) { + struct btrfs_free_space_ctl *free_space_ctl; + + free_space_ctl = bg->free_space_ctl; + spin_lock(&free_space_ctl->tree_lock); + if (free_space_ctl->free_space < + ffe_ctl->num_bytes + ffe_ctl->empty_cluster + + ffe_ctl->empty_size) { + ffe_ctl->total_free_space = max_t(u64, + ffe_ctl->total_free_space, + free_space_ctl->free_space); + spin_unlock(&free_space_ctl->tree_lock); + return 1; + } + spin_unlock(&free_space_ctl->tree_lock); + } + + offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start, + ffe_ctl->num_bytes, ffe_ctl->empty_size, + &ffe_ctl->max_extent_size); + + /* + * If we didn't find a chunk, and we haven't failed on this block group + * before, and this block group is in the middle of caching and we are + * ok with waiting, then go ahead and wait for progress to be made, and + * set @retry_unclustered to true. + * + * If @retry_unclustered is true then we've already waited on this + * block group once and should move on to the next block group. + */ + if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached && + ffe_ctl->loop > LOOP_CACHING_NOWAIT) { + wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + + ffe_ctl->empty_size); + ffe_ctl->retry_unclustered = true; + return -EAGAIN; + } else if (!offset) { + return 1; + } + ffe_ctl->found_offset = offset; + return 0; +} + +/* + * Return >0 means caller needs to re-search for free extent + * Return 0 means we have the needed free extent. + * Return <0 means we failed to locate any free extent. + */ +static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, + struct btrfs_free_cluster *last_ptr, + struct btrfs_key *ins, + struct find_free_extent_ctl *ffe_ctl, + int full_search, bool use_cluster) +{ + struct btrfs_root *root = fs_info->extent_root; + int ret; + + if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) && + ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg) + ffe_ctl->orig_have_caching_bg = true; + + if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT && + ffe_ctl->have_caching_bg) + return 1; + + if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES) + return 1; + + if (ins->objectid) { + if (!use_cluster && last_ptr) { + spin_lock(&last_ptr->lock); + last_ptr->window_start = ins->objectid; + spin_unlock(&last_ptr->lock); + } + return 0; + } + + /* + * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking + * caching kthreads as we move along + * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching + * LOOP_ALLOC_CHUNK, force a chunk allocation and try again + * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try + * again + */ + if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { + ffe_ctl->index = 0; + if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) { + /* + * We want to skip the LOOP_CACHING_WAIT step if we + * don't have any uncached bgs and we've already done a + * full search through. + */ + if (ffe_ctl->orig_have_caching_bg || !full_search) + ffe_ctl->loop = LOOP_CACHING_WAIT; + else + ffe_ctl->loop = LOOP_ALLOC_CHUNK; + } else { + ffe_ctl->loop++; + } + + if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { + struct btrfs_trans_handle *trans; + int exist = 0; + + trans = current->journal_info; + if (trans) + exist = 1; + else + trans = btrfs_join_transaction(root); + + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + return ret; + } + + ret = do_chunk_alloc(trans, ffe_ctl->flags, + CHUNK_ALLOC_FORCE); + + /* + * If we can't allocate a new chunk we've already looped + * through at least once, move on to the NO_EMPTY_SIZE + * case. + */ + if (ret == -ENOSPC) + ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; + + /* Do not bail out on ENOSPC since we can do more. */ + if (ret < 0 && ret != -ENOSPC) + btrfs_abort_transaction(trans, ret); + else + ret = 0; + if (!exist) + btrfs_end_transaction(trans); + if (ret) + return ret; + } + + if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { + /* + * Don't loop again if we already have no empty_size and + * no empty_cluster. + */ + if (ffe_ctl->empty_size == 0 && + ffe_ctl->empty_cluster == 0) + return -ENOSPC; + ffe_ctl->empty_size = 0; + ffe_ctl->empty_cluster = 0; + } + return 1; + } + return -ENOSPC; +} + +/* * walks the btree of allocated extents and find a hole of a given size. * The key ins is changed to record the hole: * ins->objectid == start position @@ -7248,6 +7784,20 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache, * * If there is no suitable free space, we will record the max size of * the free space extent currently. + * + * The overall logic and call chain: + * + * find_free_extent() + * |- Iterate through all block groups + * | |- Get a valid block group + * | |- Try to do clustered allocation in that block group + * | |- Try to do unclustered allocation in that block group + * | |- Check if the result is valid + * | | |- If valid, then exit + * | |- Jump to next block group + * | + * |- Push harder to find free extents + * |- If not found, re-iterate all block groups */ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, u64 ram_bytes, u64 num_bytes, u64 empty_size, @@ -7255,24 +7805,28 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, u64 flags, int delalloc) { int ret = 0; - struct btrfs_root *root = fs_info->extent_root; struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; - u64 search_start = 0; - u64 max_extent_size = 0; - u64 max_free_space = 0; - u64 empty_cluster = 0; + struct find_free_extent_ctl ffe_ctl = {0}; struct btrfs_space_info *space_info; - int loop = 0; - int index = btrfs_bg_flags_to_raid_index(flags); - bool failed_cluster_refill = false; - bool failed_alloc = false; bool use_cluster = true; - bool have_caching_bg = false; - bool orig_have_caching_bg = false; bool full_search = false; WARN_ON(num_bytes < fs_info->sectorsize); + + ffe_ctl.ram_bytes = ram_bytes; + ffe_ctl.num_bytes = num_bytes; + ffe_ctl.empty_size = empty_size; + ffe_ctl.flags = flags; + ffe_ctl.search_start = 0; + ffe_ctl.retry_clustered = false; + ffe_ctl.retry_unclustered = false; + ffe_ctl.delalloc = delalloc; + ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags); + ffe_ctl.have_caching_bg = false; + ffe_ctl.orig_have_caching_bg = false; + ffe_ctl.found_offset = 0; + ins->type = BTRFS_EXTENT_ITEM_KEY; ins->objectid = 0; ins->offset = 0; @@ -7308,7 +7862,8 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, spin_unlock(&space_info->lock); } - last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster); + last_ptr = fetch_cluster_info(fs_info, space_info, + &ffe_ctl.empty_cluster); if (last_ptr) { spin_lock(&last_ptr->lock); if (last_ptr->block_group) @@ -7325,10 +7880,12 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, spin_unlock(&last_ptr->lock); } - search_start = max(search_start, first_logical_byte(fs_info, 0)); - search_start = max(search_start, hint_byte); - if (search_start == hint_byte) { - block_group = btrfs_lookup_block_group(fs_info, search_start); + ffe_ctl.search_start = max(ffe_ctl.search_start, + first_logical_byte(fs_info, 0)); + ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte); + if (ffe_ctl.search_start == hint_byte) { + block_group = btrfs_lookup_block_group(fs_info, + ffe_ctl.search_start); /* * we don't want to use the block group if it doesn't match our * allocation bits, or if its not cached. @@ -7350,7 +7907,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, btrfs_put_block_group(block_group); up_read(&space_info->groups_sem); } else { - index = btrfs_bg_flags_to_raid_index( + ffe_ctl.index = btrfs_bg_flags_to_raid_index( block_group->flags); btrfs_lock_block_group(block_group, delalloc); goto have_block_group; @@ -7360,21 +7917,19 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, } } search: - have_caching_bg = false; - if (index == 0 || index == btrfs_bg_flags_to_raid_index(flags)) + ffe_ctl.have_caching_bg = false; + if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) || + ffe_ctl.index == 0) full_search = true; down_read(&space_info->groups_sem); - list_for_each_entry(block_group, &space_info->block_groups[index], - list) { - u64 offset; - int cached; - + list_for_each_entry(block_group, + &space_info->block_groups[ffe_ctl.index], list) { /* If the block group is read-only, we can skip it entirely. */ if (unlikely(block_group->ro)) continue; btrfs_grab_block_group(block_group, delalloc); - search_start = block_group->key.objectid; + ffe_ctl.search_start = block_group->key.objectid; /* * this can happen if we end up cycling through all the @@ -7398,9 +7953,9 @@ search: } have_block_group: - cached = block_group_cache_done(block_group); - if (unlikely(!cached)) { - have_caching_bg = true; + ffe_ctl.cached = block_group_cache_done(block_group); + if (unlikely(!ffe_ctl.cached)) { + ffe_ctl.have_caching_bg = true; ret = cache_block_group(block_group, 0); BUG_ON(ret < 0); ret = 0; @@ -7414,322 +7969,92 @@ have_block_group: * lets look there */ if (last_ptr && use_cluster) { - struct btrfs_block_group_cache *used_block_group; - unsigned long aligned_cluster; - /* - * the refill lock keeps out other - * people trying to start a new cluster - */ - used_block_group = btrfs_lock_cluster(block_group, - last_ptr, - delalloc); - if (!used_block_group) - goto refill_cluster; - - if (used_block_group != block_group && - (used_block_group->ro || - !block_group_bits(used_block_group, flags))) - goto release_cluster; - - offset = btrfs_alloc_from_cluster(used_block_group, - last_ptr, - num_bytes, - used_block_group->key.objectid, - &max_extent_size); - if (offset) { - /* we have a block, we're done */ - spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster( - used_block_group, - search_start, num_bytes); - if (used_block_group != block_group) { - btrfs_release_block_group(block_group, - delalloc); - block_group = used_block_group; - } - goto checks; - } - - WARN_ON(last_ptr->block_group != used_block_group); -release_cluster: - /* If we are on LOOP_NO_EMPTY_SIZE, we can't - * set up a new clusters, so lets just skip it - * and let the allocator find whatever block - * it can find. If we reach this point, we - * will have tried the cluster allocator - * plenty of times and not have found - * anything, so we are likely way too - * fragmented for the clustering stuff to find - * anything. - * - * However, if the cluster is taken from the - * current block group, release the cluster - * first, so that we stand a better chance of - * succeeding in the unclustered - * allocation. */ - if (loop >= LOOP_NO_EMPTY_SIZE && - used_block_group != block_group) { - spin_unlock(&last_ptr->refill_lock); - btrfs_release_block_group(used_block_group, - delalloc); - goto unclustered_alloc; - } + struct btrfs_block_group_cache *cluster_bg = NULL; - /* - * this cluster didn't work out, free it and - * start over - */ - btrfs_return_cluster_to_free_space(NULL, last_ptr); - - if (used_block_group != block_group) - btrfs_release_block_group(used_block_group, - delalloc); -refill_cluster: - if (loop >= LOOP_NO_EMPTY_SIZE) { - spin_unlock(&last_ptr->refill_lock); - goto unclustered_alloc; - } - - aligned_cluster = max_t(unsigned long, - empty_cluster + empty_size, - block_group->full_stripe_len); + ret = find_free_extent_clustered(block_group, last_ptr, + &ffe_ctl, &cluster_bg); - /* allocate a cluster in this block group */ - ret = btrfs_find_space_cluster(fs_info, block_group, - last_ptr, search_start, - num_bytes, - aligned_cluster); if (ret == 0) { - /* - * now pull our allocation out of this - * cluster - */ - offset = btrfs_alloc_from_cluster(block_group, - last_ptr, - num_bytes, - search_start, - &max_extent_size); - if (offset) { - /* we found one, proceed */ - spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster( - block_group, search_start, - num_bytes); - goto checks; + if (cluster_bg && cluster_bg != block_group) { + btrfs_release_block_group(block_group, + delalloc); + block_group = cluster_bg; } - } else if (!cached && loop > LOOP_CACHING_NOWAIT - && !failed_cluster_refill) { - spin_unlock(&last_ptr->refill_lock); - - failed_cluster_refill = true; - wait_block_group_cache_progress(block_group, - num_bytes + empty_cluster + empty_size); + goto checks; + } else if (ret == -EAGAIN) { goto have_block_group; - } - - /* - * at this point we either didn't find a cluster - * or we weren't able to allocate a block from our - * cluster. Free the cluster we've been trying - * to use, and go to the next block group - */ - btrfs_return_cluster_to_free_space(NULL, last_ptr); - spin_unlock(&last_ptr->refill_lock); - goto loop; - } - -unclustered_alloc: - /* - * We are doing an unclustered alloc, set the fragmented flag so - * we don't bother trying to setup a cluster again until we get - * more space. - */ - if (unlikely(last_ptr)) { - spin_lock(&last_ptr->lock); - last_ptr->fragmented = 1; - spin_unlock(&last_ptr->lock); - } - if (cached) { - struct btrfs_free_space_ctl *ctl = - block_group->free_space_ctl; - - spin_lock(&ctl->tree_lock); - if (ctl->free_space < - num_bytes + empty_cluster + empty_size) { - max_free_space = max(max_free_space, - ctl->free_space); - spin_unlock(&ctl->tree_lock); + } else if (ret > 0) { goto loop; } - spin_unlock(&ctl->tree_lock); + /* ret == -ENOENT case falls through */ } - offset = btrfs_find_space_for_alloc(block_group, search_start, - num_bytes, empty_size, - &max_extent_size); - /* - * If we didn't find a chunk, and we haven't failed on this - * block group before, and this block group is in the middle of - * caching and we are ok with waiting, then go ahead and wait - * for progress to be made, and set failed_alloc to true. - * - * If failed_alloc is true then we've already waited on this - * block group once and should move on to the next block group. - */ - if (!offset && !failed_alloc && !cached && - loop > LOOP_CACHING_NOWAIT) { - wait_block_group_cache_progress(block_group, - num_bytes + empty_size); - failed_alloc = true; + ret = find_free_extent_unclustered(block_group, last_ptr, + &ffe_ctl); + if (ret == -EAGAIN) goto have_block_group; - } else if (!offset) { + else if (ret > 0) goto loop; - } + /* ret == 0 case falls through */ checks: - search_start = round_up(offset, fs_info->stripesize); + ffe_ctl.search_start = round_up(ffe_ctl.found_offset, + fs_info->stripesize); /* move on to the next group */ - if (search_start + num_bytes > + if (ffe_ctl.search_start + num_bytes > block_group->key.objectid + block_group->key.offset) { - btrfs_add_free_space(block_group, offset, num_bytes); + btrfs_add_free_space(block_group, ffe_ctl.found_offset, + num_bytes); goto loop; } - if (offset < search_start) - btrfs_add_free_space(block_group, offset, - search_start - offset); + if (ffe_ctl.found_offset < ffe_ctl.search_start) + btrfs_add_free_space(block_group, ffe_ctl.found_offset, + ffe_ctl.search_start - ffe_ctl.found_offset); ret = btrfs_add_reserved_bytes(block_group, ram_bytes, num_bytes, delalloc); if (ret == -EAGAIN) { - btrfs_add_free_space(block_group, offset, num_bytes); + btrfs_add_free_space(block_group, ffe_ctl.found_offset, + num_bytes); goto loop; } btrfs_inc_block_group_reservations(block_group); /* we are all good, lets return */ - ins->objectid = search_start; + ins->objectid = ffe_ctl.search_start; ins->offset = num_bytes; - trace_btrfs_reserve_extent(block_group, search_start, num_bytes); + trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start, + num_bytes); btrfs_release_block_group(block_group, delalloc); break; loop: - failed_cluster_refill = false; - failed_alloc = false; + ffe_ctl.retry_clustered = false; + ffe_ctl.retry_unclustered = false; BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != - index); + ffe_ctl.index); btrfs_release_block_group(block_group, delalloc); cond_resched(); } up_read(&space_info->groups_sem); - if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg - && !orig_have_caching_bg) - orig_have_caching_bg = true; - - if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) - goto search; - - if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) + ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl, + full_search, use_cluster); + if (ret > 0) goto search; - /* - * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking - * caching kthreads as we move along - * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching - * LOOP_ALLOC_CHUNK, force a chunk allocation and try again - * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try - * again - */ - if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { - index = 0; - if (loop == LOOP_CACHING_NOWAIT) { - /* - * We want to skip the LOOP_CACHING_WAIT step if we - * don't have any uncached bgs and we've already done a - * full search through. - */ - if (orig_have_caching_bg || !full_search) - loop = LOOP_CACHING_WAIT; - else - loop = LOOP_ALLOC_CHUNK; - } else { - loop++; - } - - if (loop == LOOP_ALLOC_CHUNK) { - struct btrfs_trans_handle *trans; - int exist = 0; - - trans = current->journal_info; - if (trans) - exist = 1; - else - trans = btrfs_join_transaction(root); - - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out; - } - - ret = do_chunk_alloc(trans, flags, CHUNK_ALLOC_FORCE); - - /* - * If we can't allocate a new chunk we've already looped - * through at least once, move on to the NO_EMPTY_SIZE - * case. - */ - if (ret == -ENOSPC) - loop = LOOP_NO_EMPTY_SIZE; - - /* - * Do not bail out on ENOSPC since we - * can do more things. - */ - if (ret < 0 && ret != -ENOSPC) - btrfs_abort_transaction(trans, ret); - else - ret = 0; - if (!exist) - btrfs_end_transaction(trans); - if (ret) - goto out; - } - - if (loop == LOOP_NO_EMPTY_SIZE) { - /* - * Don't loop again if we already have no empty_size and - * no empty_cluster. - */ - if (empty_size == 0 && - empty_cluster == 0) { - ret = -ENOSPC; - goto out; - } - empty_size = 0; - empty_cluster = 0; - } - - goto search; - } else if (!ins->objectid) { - ret = -ENOSPC; - } else if (ins->objectid) { - if (!use_cluster && last_ptr) { - spin_lock(&last_ptr->lock); - last_ptr->window_start = ins->objectid; - spin_unlock(&last_ptr->lock); - } - ret = 0; - } -out: if (ret == -ENOSPC) { - if (!max_extent_size) - max_extent_size = max_free_space; + /* + * Use ffe_ctl->total_free_space as fallback if we can't find + * any contiguous hole. + */ + if (!ffe_ctl.max_extent_size) + ffe_ctl.max_extent_size = ffe_ctl.total_free_space; spin_lock(&space_info->lock); - space_info->max_extent_size = max_extent_size; + space_info->max_extent_size = ffe_ctl.max_extent_size; spin_unlock(&space_info->lock); - ins->offset = max_extent_size; + ins->offset = ffe_ctl.max_extent_size; } return ret; } @@ -8169,13 +8494,13 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_header_generation(buf, trans->transid); btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(buf, owner); - write_extent_buffer_fsid(buf, fs_info->fsid); + write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid); write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid); if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { buf->log_index = root->log_transid % 2; /* * we allow two log transactions at a time, use different - * EXENT bit to differentiate dirty pages. + * EXTENT bit to differentiate dirty pages. */ if (buf->log_index == 0) set_extent_dirty(&root->dirty_log_pages, buf->start, @@ -8221,7 +8546,12 @@ again: goto again; } - if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { + /* + * The global reserve still exists to save us from ourselves, so don't + * warn_on if we are short on our delayed refs reserve. + */ + if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS && + btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL * 10, /*DEFAULT_RATELIMIT_BURST*/ 1); @@ -8544,7 +8874,6 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, u64 bytenr; u64 generation; u64 parent; - u32 blocksize; struct btrfs_key key; struct btrfs_key first_key; struct extent_buffer *next; @@ -8569,7 +8898,6 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); btrfs_node_key_to_cpu(path->nodes[level], &first_key, path->slots[level]); - blocksize = fs_info->nodesize; next = find_extent_buffer(fs_info, bytenr); if (!next) { @@ -8693,7 +9021,7 @@ skip: ret); } } - ret = btrfs_free_extent(trans, root, bytenr, blocksize, + ret = btrfs_free_extent(trans, root, bytenr, fs_info->nodesize, parent, root->root_key.objectid, level - 1, 0); if (ret) @@ -8944,9 +9272,22 @@ int btrfs_drop_snapshot(struct btrfs_root *root, goto out_free; } + err = btrfs_run_delayed_items(trans); + if (err) + goto out_end_trans; + if (block_rsv) trans->block_rsv = block_rsv; + /* + * This will help us catch people modifying the fs tree while we're + * dropping it. It is unsafe to mess with the fs tree while it's being + * dropped as we unlock the root node and parent nodes as we walk down + * the tree, assuming nothing will change. If something does change + * then we'll have stale information and drop references to blocks we've + * already dropped. + */ + set_bit(BTRFS_ROOT_DELETING, &root->state); if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_header_level(root->node); path->nodes[level] = btrfs_lock_root_node(root); @@ -9421,7 +9762,7 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache) } /* - * checks to see if its even possible to relocate this block group. + * Checks to see if it's even possible to relocate this block group. * * @return - -1 if it's not a good idea to relocate this block group, 0 if its * ok to go ahead and try. @@ -10049,7 +10390,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) * check for two cases, either we are full, and therefore * don't need to bother with the caching work since we won't * find any space, or we are empty, and we can just add all - * the space in and be done with it. This saves us _alot_ of + * the space in and be done with it. This saves us _a_lot_ of * time, particularly in the full case. */ if (found_key.offset == btrfs_block_group_used(&cache->item)) { @@ -10154,6 +10495,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) add_block_group_free_space(trans, block_group); /* already aborted the transaction if it failed. */ next: + btrfs_delayed_refs_rsv_release(fs_info, 1); list_del_init(&block_group->bg_list); } btrfs_trans_release_chunk_metadata(trans); @@ -10231,6 +10573,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, link_block_group(cache); list_add_tail(&cache->bg_list, &trans->new_bgs); + trans->delayed_ref_updates++; + btrfs_update_delayed_refs_rsv(trans); set_avail_alloc_bits(fs_info, type); return 0; @@ -10268,6 +10612,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, int factor; struct btrfs_caching_control *caching_ctl = NULL; bool remove_em; + bool remove_rsv = false; block_group = btrfs_lookup_block_group(fs_info, group_start); BUG_ON(!block_group); @@ -10315,7 +10660,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, mutex_lock(&trans->transaction->cache_write_mutex); /* - * make sure our free spache cache IO is done before remove the + * Make sure our free space cache IO is done before removing the * free space inode */ spin_lock(&trans->transaction->dirty_bgs_lock); @@ -10332,6 +10677,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, if (!list_empty(&block_group->dirty_list)) { list_del_init(&block_group->dirty_list); + remove_rsv = true; btrfs_put_block_group(block_group); } spin_unlock(&trans->transaction->dirty_bgs_lock); @@ -10541,6 +10887,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, ret = btrfs_del_item(trans, root, path); out: + if (remove_rsv) + btrfs_delayed_refs_rsv_release(fs_info, 1); btrfs_free_path(path); return ret; } @@ -10698,7 +11046,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&space_info->lock); spin_lock(&block_group->lock); - space_info->bytes_pinned -= block_group->pinned; + update_bytes_pinned(space_info, -block_group->pinned); space_info->bytes_readonly += block_group->pinned; percpu_counter_add_batch(&space_info->total_bytes_pinned, -block_group->pinned, @@ -10829,7 +11177,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, if (!blk_queue_discard(bdev_get_queue(device->bdev))) return 0; - /* Not writeable = nothing to do. */ + /* Not writable = nothing to do. */ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) return 0; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d228f706ff3e..fc126b92ea59 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -89,9 +89,18 @@ void btrfs_leak_debug_check(void) static inline void __btrfs_debug_check_extent_io_range(const char *caller, struct extent_io_tree *tree, u64 start, u64 end) { - if (tree->ops && tree->ops->check_extent_io_range) - tree->ops->check_extent_io_range(tree->private_data, caller, - start, end); + struct inode *inode = tree->private_data; + u64 isize; + + if (!inode || !is_data_inode(inode)) + return; + + isize = i_size_read(inode); + if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { + btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, + "%s: ino %llu isize %llu odd range [%llu,%llu]", + caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); + } } #else #define btrfs_leak_debug_add(new, head) do {} while (0) @@ -344,13 +353,6 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree, return tree_search_for_insert(tree, offset, NULL, NULL); } -static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, - struct extent_state *other) -{ - if (tree->ops && tree->ops->merge_extent_hook) - tree->ops->merge_extent_hook(tree->private_data, new, other); -} - /* * utility function to look for merge candidates inside a given range. * Any extents with matching state are merged together into a single @@ -374,7 +376,10 @@ static void merge_state(struct extent_io_tree *tree, other = rb_entry(other_node, struct extent_state, rb_node); if (other->end == state->start - 1 && other->state == state->state) { - merge_cb(tree, state, other); + if (tree->private_data && + is_data_inode(tree->private_data)) + btrfs_merge_delalloc_extent(tree->private_data, + state, other); state->start = other->start; rb_erase(&other->rb_node, &tree->state); RB_CLEAR_NODE(&other->rb_node); @@ -386,7 +391,10 @@ static void merge_state(struct extent_io_tree *tree, other = rb_entry(other_node, struct extent_state, rb_node); if (other->start == state->end + 1 && other->state == state->state) { - merge_cb(tree, state, other); + if (tree->private_data && + is_data_inode(tree->private_data)) + btrfs_merge_delalloc_extent(tree->private_data, + state, other); state->end = other->end; rb_erase(&other->rb_node, &tree->state); RB_CLEAR_NODE(&other->rb_node); @@ -395,20 +403,6 @@ static void merge_state(struct extent_io_tree *tree, } } -static void set_state_cb(struct extent_io_tree *tree, - struct extent_state *state, unsigned *bits) -{ - if (tree->ops && tree->ops->set_bit_hook) - tree->ops->set_bit_hook(tree->private_data, state, bits); -} - -static void clear_state_cb(struct extent_io_tree *tree, - struct extent_state *state, unsigned *bits) -{ - if (tree->ops && tree->ops->clear_bit_hook) - tree->ops->clear_bit_hook(tree->private_data, state, bits); -} - static void set_state_bits(struct extent_io_tree *tree, struct extent_state *state, unsigned *bits, struct extent_changeset *changeset); @@ -451,13 +445,6 @@ static int insert_state(struct extent_io_tree *tree, return 0; } -static void split_cb(struct extent_io_tree *tree, struct extent_state *orig, - u64 split) -{ - if (tree->ops && tree->ops->split_extent_hook) - tree->ops->split_extent_hook(tree->private_data, orig, split); -} - /* * split a given extent state struct in two, inserting the preallocated * struct 'prealloc' as the newly created second half. 'split' indicates an @@ -477,7 +464,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, { struct rb_node *node; - split_cb(tree, orig, split); + if (tree->private_data && is_data_inode(tree->private_data)) + btrfs_split_delalloc_extent(tree->private_data, orig, split); prealloc->start = orig->start; prealloc->end = split - 1; @@ -504,7 +492,7 @@ static struct extent_state *next_state(struct extent_state *state) /* * utility function to clear some bits in an extent state struct. - * it will optionally wake up any one waiting on this state (wake == 1). + * it will optionally wake up anyone waiting on this state (wake == 1). * * If no bits are set on the state struct after clearing things, the * struct is freed and removed from the tree @@ -523,7 +511,10 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, WARN_ON(range > tree->dirty_bytes); tree->dirty_bytes -= range; } - clear_state_cb(tree, state, bits); + + if (tree->private_data && is_data_inode(tree->private_data)) + btrfs_clear_delalloc_extent(tree->private_data, state, bits); + ret = add_extent_changeset(state, bits_to_clear, changeset, 0); BUG_ON(ret < 0); state->state &= ~bits_to_clear; @@ -800,7 +791,9 @@ static void set_state_bits(struct extent_io_tree *tree, unsigned bits_to_set = *bits & ~EXTENT_CTLBITS; int ret; - set_state_cb(tree, state, bits); + if (tree->private_data && is_data_inode(tree->private_data)) + btrfs_set_delalloc_extent(tree->private_data, state, bits); + if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; tree->dirty_bytes += range; @@ -1459,16 +1452,16 @@ out: * find a contiguous range of bytes in the file marked as delalloc, not * more than 'max_bytes'. start and end are used to return the range, * - * 1 is returned if we find something, 0 if nothing was in the tree + * true is returned if we find something, false if nothing was in the tree */ -static noinline u64 find_delalloc_range(struct extent_io_tree *tree, +static noinline bool find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state) { struct rb_node *node; struct extent_state *state; u64 cur_start = *start; - u64 found = 0; + bool found = false; u64 total_bytes = 0; spin_lock(&tree->lock); @@ -1479,8 +1472,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, */ node = tree_search(tree, cur_start); if (!node) { - if (!found) - *end = (u64)-1; + *end = (u64)-1; goto out; } @@ -1500,7 +1492,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, *cached_state = state; refcount_inc(&state->refs); } - found++; + found = true; *end = state->end; cur_start = state->end + 1; node = rb_next(node); @@ -1558,19 +1550,22 @@ static noinline int lock_delalloc_pages(struct inode *inode, } /* - * find a contiguous range of bytes in the file marked as delalloc, not - * more than 'max_bytes'. start and end are used to return the range, + * Find and lock a contiguous range of bytes in the file marked as delalloc, no + * more than @max_bytes. @Start and @end are used to return the range, * - * 1 is returned if we find something, 0 if nothing was in the tree + * Return: true if we find something + * false if nothing was in the tree */ -static noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode, +EXPORT_FOR_TESTS +noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, struct page *locked_page, u64 *start, - u64 *end, u64 max_bytes) + u64 *end) { + u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; u64 delalloc_start; u64 delalloc_end; - u64 found; + bool found; struct extent_state *cached_state = NULL; int ret; int loops = 0; @@ -1585,7 +1580,7 @@ again: *start = delalloc_start; *end = delalloc_end; free_extent_state(cached_state); - return 0; + return false; } /* @@ -1605,6 +1600,7 @@ again: /* step two, lock all the pages after the page that has start */ ret = lock_delalloc_pages(inode, locked_page, delalloc_start, delalloc_end); + ASSERT(!ret || ret == -EAGAIN); if (ret == -EAGAIN) { /* some of the pages are gone, lets avoid looping by * shortening the size of the delalloc range we're searching @@ -1616,11 +1612,10 @@ again: loops = 1; goto again; } else { - found = 0; + found = false; goto out_failed; } } - BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ /* step three, lock the state bits for the whole range */ lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state); @@ -1643,17 +1638,6 @@ out_failed: return found; } -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -u64 btrfs_find_lock_delalloc_range(struct inode *inode, - struct extent_io_tree *tree, - struct page *locked_page, u64 *start, - u64 *end, u64 max_bytes) -{ - return find_lock_delalloc_range(inode, tree, locked_page, start, end, - max_bytes); -} -#endif - static int __process_pages_contig(struct address_space *mapping, struct page *locked_page, pgoff_t start_index, pgoff_t end_index, @@ -2349,13 +2333,11 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, } /* - * this is a generic handler for readpage errors (default - * readpage_io_failed_hook). if other copies exist, read those and write back - * good data to the failed position. does not investigate in remapping the - * failed extent elsewhere, hoping the device will be smart enough to do this as - * needed + * This is a generic handler for readpage errors. If other copies exist, read + * those and write back good data to the failed position. Does not investigate + * in remapping the failed extent elsewhere, hoping the device will be smart + * enough to do this as needed */ - static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int failed_mirror) @@ -2412,14 +2394,9 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, void end_extent_writepage(struct page *page, int err, u64 start, u64 end) { int uptodate = (err == 0); - struct extent_io_tree *tree; int ret = 0; - tree = &BTRFS_I(page->mapping->host)->io_tree; - - if (tree->ops && tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, start, end, NULL, - uptodate); + btrfs_writepage_endio_finish_ordered(page, start, end, uptodate); if (!uptodate) { ClearPageUptodate(page); @@ -2522,6 +2499,8 @@ static void end_bio_extent_readpage(struct bio *bio) struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + bool data_inode = btrfs_ino(BTRFS_I(inode)) + != BTRFS_BTREE_INODE_OBJECTID; btrfs_debug(fs_info, "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", @@ -2551,7 +2530,7 @@ static void end_bio_extent_readpage(struct bio *bio) len = bvec->bv_len; mirror = io_bio->mirror_num; - if (likely(uptodate && tree->ops)) { + if (likely(uptodate)) { ret = tree->ops->readpage_end_io_hook(io_bio, offset, page, start, end, mirror); @@ -2567,38 +2546,37 @@ static void end_bio_extent_readpage(struct bio *bio) if (likely(uptodate)) goto readpage_ok; - if (tree->ops) { - ret = tree->ops->readpage_io_failed_hook(page, mirror); - if (ret == -EAGAIN) { - /* - * Data inode's readpage_io_failed_hook() always - * returns -EAGAIN. - * - * The generic bio_readpage_error handles errors - * the following way: If possible, new read - * requests are created and submitted and will - * end up in end_bio_extent_readpage as well (if - * we're lucky, not in the !uptodate case). In - * that case it returns 0 and we just go on with - * the next page in our bio. If it can't handle - * the error it will return -EIO and we remain - * responsible for that page. - */ - ret = bio_readpage_error(bio, offset, page, - start, end, mirror); - if (ret == 0) { - uptodate = !bio->bi_status; - offset += len; - continue; - } - } + if (data_inode) { /* - * metadata's readpage_io_failed_hook() always returns - * -EIO and fixes nothing. -EIO is also returned if - * data inode error could not be fixed. + * The generic bio_readpage_error handles errors the + * following way: If possible, new read requests are + * created and submitted and will end up in + * end_bio_extent_readpage as well (if we're lucky, + * not in the !uptodate case). In that case it returns + * 0 and we just go on with the next page in our bio. + * If it can't handle the error it will return -EIO and + * we remain responsible for that page. */ - ASSERT(ret == -EIO); + ret = bio_readpage_error(bio, offset, page, start, end, + mirror); + if (ret == 0) { + uptodate = !bio->bi_status; + offset += len; + continue; + } + } else { + struct extent_buffer *eb; + + eb = (struct extent_buffer *)page->private; + set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); + eb->read_mirror = mirror; + atomic_dec(&eb->io_pages); + if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, + &eb->bflags)) + btree_readahead_hook(eb, -EIO); + + ret = -EIO; } readpage_ok: if (likely(uptodate)) { @@ -2607,7 +2585,7 @@ readpage_ok: unsigned off; /* Zero out the end if this page straddles i_size */ - off = i_size & (PAGE_SIZE-1); + off = offset_in_page(i_size); if (page->index == end_index && off) zero_user_segment(page, off, PAGE_SIZE); SetPageUptodate(page); @@ -2644,8 +2622,7 @@ readpage_ok: if (extent_len) endio_readpage_release_extent(tree, extent_start, extent_len, uptodate); - if (io_bio->end_io) - io_bio->end_io(io_bio, blk_status_to_errno(bio->bi_status)); + btrfs_io_bio_free_csum(io_bio); bio_put(bio); } @@ -2782,8 +2759,8 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, else contig = bio_end_sector(bio) == sector; - if (tree->ops && btrfs_merge_bio_hook(page, offset, page_size, - bio, bio_flags)) + ASSERT(tree->ops); + if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags)) can_merge = false; if (prev_bio_flags != bio_flags || !contig || !can_merge || @@ -2911,7 +2888,7 @@ static int __do_readpage(struct extent_io_tree *tree, if (page->index == last_byte >> PAGE_SHIFT) { char *userpage; - size_t zero_offset = last_byte & (PAGE_SIZE - 1); + size_t zero_offset = offset_in_page(last_byte); if (zero_offset) { iosize = PAGE_SIZE - zero_offset; @@ -3205,7 +3182,7 @@ static void update_nr_written(struct writeback_control *wbc, /* * helper for __extent_writepage, doing all of the delayed allocation setup. * - * This returns 1 if our fill_delalloc function did all the work required + * This returns 1 if btrfs_run_delalloc_range function did all the work required * to write the page (copy into inline extent). In this case the IO has * been started and the page is already unlocked. * @@ -3213,44 +3190,37 @@ static void update_nr_written(struct writeback_control *wbc, * This returns < 0 if there were errors (page still locked) */ static noinline_for_stack int writepage_delalloc(struct inode *inode, - struct page *page, struct writeback_control *wbc, - struct extent_page_data *epd, - u64 delalloc_start, - unsigned long *nr_written) + struct page *page, struct writeback_control *wbc, + u64 delalloc_start, unsigned long *nr_written) { - struct extent_io_tree *tree = epd->tree; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; u64 page_end = delalloc_start + PAGE_SIZE - 1; - u64 nr_delalloc; + bool found; u64 delalloc_to_write = 0; u64 delalloc_end = 0; int ret; int page_started = 0; - if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc) - return 0; while (delalloc_end < page_end) { - nr_delalloc = find_lock_delalloc_range(inode, tree, + found = find_lock_delalloc_range(inode, tree, page, &delalloc_start, - &delalloc_end, - BTRFS_MAX_EXTENT_SIZE); - if (nr_delalloc == 0) { + &delalloc_end); + if (!found) { delalloc_start = delalloc_end + 1; continue; } - ret = tree->ops->fill_delalloc(inode, page, - delalloc_start, - delalloc_end, - &page_started, - nr_written, wbc); + ret = btrfs_run_delalloc_range(inode, page, delalloc_start, + delalloc_end, &page_started, nr_written, wbc); /* File system has been set read-only */ if (ret) { SetPageError(page); - /* fill_delalloc should be return < 0 for error - * but just in case, we use > 0 here meaning the - * IO is started, so we don't want to return > 0 - * unless things are going well. + /* + * btrfs_run_delalloc_range should return < 0 for error + * but just in case, we use > 0 here meaning the IO is + * started, so we don't want to return > 0 unless + * things are going well. */ ret = ret < 0 ? ret : -EIO; goto done; @@ -3323,20 +3293,17 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, int nr = 0; bool compressed; - if (tree->ops && tree->ops->writepage_start_hook) { - ret = tree->ops->writepage_start_hook(page, start, - page_end); - if (ret) { - /* Fixup worker will requeue */ - if (ret == -EBUSY) - wbc->pages_skipped++; - else - redirty_page_for_writepage(wbc, page); + ret = btrfs_writepage_cow_fixup(page, start, page_end); + if (ret) { + /* Fixup worker will requeue */ + if (ret == -EBUSY) + wbc->pages_skipped++; + else + redirty_page_for_writepage(wbc, page); - update_nr_written(wbc, nr_written); - unlock_page(page); - return 1; - } + update_nr_written(wbc, nr_written); + unlock_page(page); + return 1; } /* @@ -3347,9 +3314,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, end = page_end; if (i_size <= start) { - if (tree->ops && tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, start, - page_end, NULL, 1); + btrfs_writepage_endio_finish_ordered(page, start, page_end, 1); goto done; } @@ -3360,9 +3325,8 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, u64 offset; if (cur >= i_size) { - if (tree->ops && tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, cur, - page_end, NULL, 1); + btrfs_writepage_endio_finish_ordered(page, cur, + page_end, 1); break; } em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur, @@ -3396,11 +3360,10 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, * end_io notification does not happen here for * compressed extents */ - if (!compressed && tree->ops && - tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, cur, - cur + iosize - 1, - NULL, 1); + if (!compressed) + btrfs_writepage_endio_finish_ordered(page, cur, + cur + iosize - 1, + 1); else if (compressed) { /* we don't want to end_page_writeback on * a compressed extent. this happens @@ -3469,7 +3432,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, ClearPageError(page); - pg_offset = i_size & (PAGE_SIZE - 1); + pg_offset = offset_in_page(i_size); if (page->index > end_index || (page->index == end_index && !pg_offset)) { page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); @@ -3491,11 +3454,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, set_page_extent_mapped(page); - ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written); - if (ret == 1) - goto done_unlocked; - if (ret) - goto done; + if (!epd->extent_locked) { + ret = writepage_delalloc(inode, page, wbc, start, &nr_written); + if (ret == 1) + goto done_unlocked; + if (ret) + goto done; + } ret = __extent_writepage_io(inode, page, wbc, epd, i_size, nr_written, write_flags, &nr); @@ -3934,12 +3899,25 @@ static int extent_write_cache_pages(struct address_space *mapping, range_whole = 1; scanned = 1; } - if (wbc->sync_mode == WB_SYNC_ALL) + + /* + * We do the tagged writepage as long as the snapshot flush bit is set + * and we are the first one who do the filemap_flush() on this inode. + * + * The nr_to_write == LONG_MAX is needed to make sure other flushers do + * not race in and drop the bit. + */ + if (range_whole && wbc->nr_to_write == LONG_MAX && + test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH, + &BTRFS_I(inode)->runtime_flags)) + wbc->tagged_writepages = 1; + + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; retry: - if (wbc->sync_mode == WB_SYNC_ALL) + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && !nr_to_write_done && (index <= end) && @@ -4084,10 +4062,8 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, if (clear_page_dirty_for_io(page)) ret = __extent_writepage(page, &wbc_writepages, &epd); else { - if (tree->ops && tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, start, - start + PAGE_SIZE - 1, - NULL, 1); + btrfs_writepage_endio_finish_ordered(page, start, + start + PAGE_SIZE - 1, 1); unlock_page(page); } put_page(page); @@ -4118,42 +4094,36 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { struct bio *bio = NULL; - unsigned page_idx; unsigned long bio_flags = 0; struct page *pagepool[16]; - struct page *page; struct extent_map *em_cached = NULL; struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; int nr = 0; u64 prev_em_start = (u64)-1; - for (page_idx = 0; page_idx < nr_pages; page_idx++) { - page = list_entry(pages->prev, struct page, lru); + while (!list_empty(pages)) { + for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) { + struct page *page = list_entry(pages->prev, + struct page, lru); - prefetchw(&page->flags); - list_del(&page->lru); - if (add_to_page_cache_lru(page, mapping, - page->index, - readahead_gfp_mask(mapping))) { - put_page(page); - continue; + prefetchw(&page->flags); + list_del(&page->lru); + if (add_to_page_cache_lru(page, mapping, page->index, + readahead_gfp_mask(mapping))) { + put_page(page); + continue; + } + + pagepool[nr++] = page; } - pagepool[nr++] = page; - if (nr < ARRAY_SIZE(pagepool)) - continue; __extent_readpages(tree, pagepool, nr, &em_cached, &bio, - &bio_flags, &prev_em_start); - nr = 0; + &bio_flags, &prev_em_start); } - if (nr) - __extent_readpages(tree, pagepool, nr, &em_cached, &bio, - &bio_flags, &prev_em_start); if (em_cached) free_extent_map(em_cached); - BUG_ON(!list_empty(pages)); if (bio) return submit_one_bio(bio, 0, bio_flags); return 0; @@ -4342,7 +4312,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, /* * Sanity check, extent_fiemap() should have ensured that new - * fiemap extent won't overlap with cahced one. + * fiemap extent won't overlap with cached one. * Not recoverable. * * NOTE: Physical address can overlap, due to compression @@ -4914,13 +4884,6 @@ again: check_buffer_tree_ref(eb); set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); - /* - * We will free dummy extent buffer's if they come into - * free_extent_buffer with a ref count of 2, but if we are using this we - * want the buffers to stay in memory until we're done with them, so - * bump the ref count again. - */ - atomic_inc(&eb->refs); return eb; free_eb: btrfs_release_extent_buffer(eb); @@ -5102,7 +5065,9 @@ void free_extent_buffer(struct extent_buffer *eb) while (1) { refs = atomic_read(&eb->refs); - if (refs <= 3) + if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3) + || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && + refs == 1)) break; old = atomic_cmpxchg(&eb->refs, refs, refs - 1); if (old == refs) @@ -5111,10 +5076,6 @@ void free_extent_buffer(struct extent_buffer *eb) spin_lock(&eb->refs_lock); if (atomic_read(&eb->refs) == 2 && - test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)) - atomic_dec(&eb->refs); - - if (atomic_read(&eb->refs) == 2 && test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && !extent_buffer_under_io(eb) && test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) @@ -5340,7 +5301,7 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, struct page *page; char *kaddr; char *dst = (char *)dstv; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(eb->start); unsigned long i = (start_offset + start) >> PAGE_SHIFT; if (start + len > eb->len) { @@ -5350,7 +5311,7 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, return; } - offset = (start_offset + start) & (PAGE_SIZE - 1); + offset = offset_in_page(start_offset + start); while (len > 0) { page = eb->pages[i]; @@ -5375,14 +5336,14 @@ int read_extent_buffer_to_user(const struct extent_buffer *eb, struct page *page; char *kaddr; char __user *dst = (char __user *)dstv; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(eb->start); unsigned long i = (start_offset + start) >> PAGE_SHIFT; int ret = 0; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & (PAGE_SIZE - 1); + offset = offset_in_page(start_offset + start); while (len > 0) { page = eb->pages[i]; @@ -5413,10 +5374,10 @@ int map_private_extent_buffer(const struct extent_buffer *eb, char **map, unsigned long *map_start, unsigned long *map_len) { - size_t offset = start & (PAGE_SIZE - 1); + size_t offset; char *kaddr; struct page *p; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(eb->start); unsigned long i = (start_offset + start) >> PAGE_SHIFT; unsigned long end_i = (start_offset + start + min_len - 1) >> PAGE_SHIFT; @@ -5453,14 +5414,14 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, struct page *page; char *kaddr; char *ptr = (char *)ptrv; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(eb->start); unsigned long i = (start_offset + start) >> PAGE_SHIFT; int ret = 0; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & (PAGE_SIZE - 1); + offset = offset_in_page(start_offset + start); while (len > 0) { page = eb->pages[i]; @@ -5509,13 +5470,13 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv, struct page *page; char *kaddr; char *src = (char *)srcv; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(eb->start); unsigned long i = (start_offset + start) >> PAGE_SHIFT; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & (PAGE_SIZE - 1); + offset = offset_in_page(start_offset + start); while (len > 0) { page = eb->pages[i]; @@ -5539,13 +5500,13 @@ void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start, size_t offset; struct page *page; char *kaddr; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(eb->start); unsigned long i = (start_offset + start) >> PAGE_SHIFT; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & (PAGE_SIZE - 1); + offset = offset_in_page(start_offset + start); while (len > 0) { page = eb->pages[i]; @@ -5584,13 +5545,12 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, size_t offset; struct page *page; char *kaddr; - size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(dst->start); unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT; WARN_ON(src->len != dst_len); - offset = (start_offset + dst_offset) & - (PAGE_SIZE - 1); + offset = offset_in_page(start_offset + dst_offset); while (len > 0) { page = dst->pages[i]; @@ -5626,7 +5586,7 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb, unsigned long *page_index, size_t *page_offset) { - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(eb->start); size_t byte_offset = BIT_BYTE(nr); size_t offset; @@ -5638,7 +5598,7 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb, offset = start_offset + start + byte_offset; *page_index = offset >> PAGE_SHIFT; - *page_offset = offset & (PAGE_SIZE - 1); + *page_offset = offset_in_page(offset); } /** @@ -5780,7 +5740,7 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, size_t cur; size_t dst_off_in_page; size_t src_off_in_page; - size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(dst->start); unsigned long dst_i; unsigned long src_i; @@ -5798,10 +5758,8 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, } while (len > 0) { - dst_off_in_page = (start_offset + dst_offset) & - (PAGE_SIZE - 1); - src_off_in_page = (start_offset + src_offset) & - (PAGE_SIZE - 1); + dst_off_in_page = offset_in_page(start_offset + dst_offset); + src_off_in_page = offset_in_page(start_offset + src_offset); dst_i = (start_offset + dst_offset) >> PAGE_SHIFT; src_i = (start_offset + src_offset) >> PAGE_SHIFT; @@ -5829,7 +5787,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, size_t src_off_in_page; unsigned long dst_end = dst_offset + len - 1; unsigned long src_end = src_offset + len - 1; - size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(dst->start); unsigned long dst_i; unsigned long src_i; @@ -5853,10 +5811,8 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, dst_i = (start_offset + dst_end) >> PAGE_SHIFT; src_i = (start_offset + src_end) >> PAGE_SHIFT; - dst_off_in_page = (start_offset + dst_end) & - (PAGE_SIZE - 1); - src_off_in_page = (start_offset + src_end) & - (PAGE_SIZE - 1); + dst_off_in_page = offset_in_page(start_offset + dst_end); + src_off_in_page = offset_in_page(start_offset + src_end); cur = min_t(unsigned long, len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 369daa5d4f73..9673be3f3d1f 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -37,18 +37,22 @@ #define EXTENT_BIO_COMPRESSED 1 #define EXTENT_BIO_FLAG_SHIFT 16 -/* these are bit numbers for test/set bit */ -#define EXTENT_BUFFER_UPTODATE 0 -#define EXTENT_BUFFER_DIRTY 2 -#define EXTENT_BUFFER_CORRUPT 3 -#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */ -#define EXTENT_BUFFER_TREE_REF 5 -#define EXTENT_BUFFER_STALE 6 -#define EXTENT_BUFFER_WRITEBACK 7 -#define EXTENT_BUFFER_READ_ERR 8 /* read IO error */ -#define EXTENT_BUFFER_UNMAPPED 9 -#define EXTENT_BUFFER_IN_TREE 10 -#define EXTENT_BUFFER_WRITE_ERR 11 /* write IO error */ +enum { + EXTENT_BUFFER_UPTODATE, + EXTENT_BUFFER_DIRTY, + EXTENT_BUFFER_CORRUPT, + /* this got triggered by readahead */ + EXTENT_BUFFER_READAHEAD, + EXTENT_BUFFER_TREE_REF, + EXTENT_BUFFER_STALE, + EXTENT_BUFFER_WRITEBACK, + /* read IO error */ + EXTENT_BUFFER_READ_ERR, + EXTENT_BUFFER_UNMAPPED, + EXTENT_BUFFER_IN_TREE, + /* write IO error */ + EXTENT_BUFFER_WRITE_ERR, +}; /* these are flags for __process_pages_contig */ #define PAGE_UNLOCK (1 << 0) @@ -94,38 +98,13 @@ typedef blk_status_t (extent_submit_bio_start_t)(void *private_data, struct extent_io_ops { /* - * The following callbacks must be allways defined, the function + * The following callbacks must be always defined, the function * pointer will be called unconditionally. */ extent_submit_bio_hook_t *submit_bio_hook; int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int mirror); - int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); - - /* - * Optional hooks, called if the pointer is not NULL - */ - int (*fill_delalloc)(void *private_data, struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, - struct writeback_control *wbc); - - int (*writepage_start_hook)(struct page *page, u64 start, u64 end); - void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, - struct extent_state *state, int uptodate); - void (*set_bit_hook)(void *private_data, struct extent_state *state, - unsigned *bits); - void (*clear_bit_hook)(void *private_data, - struct extent_state *state, - unsigned *bits); - void (*merge_extent_hook)(void *private_data, - struct extent_state *new, - struct extent_state *other); - void (*split_extent_hook)(void *private_data, - struct extent_state *orig, u64 split); - void (*check_extent_io_range)(void *private_data, const char *caller, - u64 start, u64 end); }; struct extent_io_tree { @@ -353,11 +332,11 @@ static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start, } static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, - u64 end) + u64 end, struct extent_state **cached) { return clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL); + EXTENT_DO_ACCOUNTING, 0, 0, cached); } int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, @@ -546,10 +525,9 @@ int free_io_failure(struct extent_io_tree *failure_tree, struct extent_io_tree *io_tree, struct io_failure_record *rec); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -u64 btrfs_find_lock_delalloc_range(struct inode *inode, - struct extent_io_tree *tree, - struct page *locked_page, u64 *start, - u64 *end, u64 max_bytes); +bool find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, + struct page *locked_page, u64 *start, + u64 *end); #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 7eea8b6e2cd3..a042a193c120 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -475,7 +475,8 @@ static struct extent_map *prev_extent_map(struct extent_map *em) return container_of(prev, struct extent_map, rb_node); } -/* helper for btfs_get_extent. Given an existing extent in the tree, +/* + * Helper for btrfs_get_extent. Given an existing extent in the tree, * the existing extent is the nearest extent to map_start, * and an extent that you want to insert, deal with overlap and insert * the best fitted new extent into the tree. diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 31977ffd6190..ef05a0121652 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -11,13 +11,20 @@ #define EXTENT_MAP_INLINE ((u64)-2) #define EXTENT_MAP_DELALLOC ((u64)-1) -/* bits for the flags field */ -#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */ -#define EXTENT_FLAG_COMPRESSED 1 -#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ -#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ -#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */ -#define EXTENT_FLAG_FS_MAPPING 6 /* filesystem extent mapping type */ +/* bits for the extent_map::flags field */ +enum { + /* this entry not yet on disk, don't free it */ + EXTENT_FLAG_PINNED, + EXTENT_FLAG_COMPRESSED, + /* pre-allocated extent */ + EXTENT_FLAG_PREALLOC, + /* Logging this extent */ + EXTENT_FLAG_LOGGING, + /* Filling in a preallocated extent */ + EXTENT_FLAG_FILLING, + /* filesystem extent mapping type */ + EXTENT_FLAG_FS_MAPPING, +}; struct extent_map { struct rb_node rb_node; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index ba74827beb32..920bf3b4b0ef 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -142,11 +142,6 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, return ret; } -static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err) -{ - kfree(bio->csum_allocated); -} - static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u64 logical_offset, u32 *dst, int dio) { @@ -175,14 +170,12 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; if (!dst) { if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { - btrfs_bio->csum_allocated = kmalloc_array(nblocks, - csum_size, GFP_NOFS); - if (!btrfs_bio->csum_allocated) { + btrfs_bio->csum = kmalloc_array(nblocks, csum_size, + GFP_NOFS); + if (!btrfs_bio->csum) { btrfs_free_path(path); return BLK_STS_RESOURCE; } - btrfs_bio->csum = btrfs_bio->csum_allocated; - btrfs_bio->end_io = btrfs_io_bio_endio_readpage; } else { btrfs_bio->csum = btrfs_bio->csum_inline; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a3c22e16509b..d38dc8c31533 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -399,7 +399,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, size_t copied = 0; size_t total_copied = 0; int pg = 0; - int offset = pos & (PAGE_SIZE - 1); + int offset = offset_in_page(pos); while (write_bytes > 0) { size_t count = min_t(size_t, @@ -1611,7 +1611,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, return -ENOMEM; while (iov_iter_count(i) > 0) { - size_t offset = pos & (PAGE_SIZE - 1); + size_t offset = offset_in_page(pos); size_t sector_offset; size_t write_bytes = min(iov_iter_count(i), nrptrs * (size_t)PAGE_SIZE - @@ -2005,7 +2005,7 @@ int btrfs_release_file(struct inode *inode, struct file *filp) filp->private_data = NULL; /* - * ordered_data_close is set by settattr when we are about to truncate + * ordered_data_close is set by setattr when we are about to truncate * a file from a non-zero size to a zero size. This tries to * flush down new bytes that may have been written if the * application were using truncate to replace a file in place. @@ -2089,8 +2089,32 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); /* + * Before we acquired the inode's lock, someone may have dirtied more + * pages in the target range. We need to make sure that writeback for + * any such pages does not start while we are logging the inode, because + * if it does, any of the following might happen when we are not doing a + * full inode sync: + * + * 1) We log an extent after its writeback finishes but before its + * checksums are added to the csum tree, leading to -EIO errors + * when attempting to read the extent after a log replay. + * + * 2) We can end up logging an extent before its writeback finishes. + * Therefore after the log replay we will have a file extent item + * pointing to an unwritten extent (and no data checksums as well). + * + * So trigger writeback for any eventual new dirty pages and then we + * wait for all ordered extents to complete below. + */ + ret = start_ordered_ops(inode, start, end); + if (ret) { + inode_unlock(inode); + goto out; + } + + /* * We have to do this here to avoid the priority inversion of waiting on - * IO of a lower priority task while holding a transaciton open. + * IO of a lower priority task while holding a transaction open. */ ret = btrfs_wait_ordered_range(inode, start, len); if (ret) { @@ -2130,7 +2154,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * here we could get into a situation where we're waiting on IO to * happen that is blocked on a transaction trying to commit. With start * we inc the extwriter counter, so we wait for all extwriters to exit - * before we start blocking join'ers. This comment is to keep somebody + * before we start blocking joiners. This comment is to keep somebody * from thinking they are super smart and changing this to * btrfs_join_transaction *cough*Josef*cough*. */ @@ -2162,25 +2186,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); - /* - * If any of the ordered extents had an error, just return it to user - * space, so that the application knows some writes didn't succeed and - * can take proper action (retry for e.g.). Blindly committing the - * transaction in this case, would fool userspace that everything was - * successful. And we also want to make sure our log doesn't contain - * file extent items pointing to extents that weren't fully written to - - * just like in the non fast fsync path, where we check for the ordered - * operation's error flag before writing to the log tree and return -EIO - * if any of them had this flag set (btrfs_wait_ordered_range) - - * therefore we need to check for errors in the ordered operations, - * which are indicated by ctx.io_err. - */ - if (ctx.io_err) { - btrfs_end_transaction(trans); - ret = ctx.io_err; - goto out; - } - if (ret != BTRFS_NO_LOG_SYNC) { if (!ret) { ret = btrfs_sync_log(trans, root, &ctx); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 4ba0aedc878b..74aa552f4793 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -75,7 +75,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, * sure NOFS is set to keep us from deadlocking. */ nofs_flag = memalloc_nofs_save(); - inode = btrfs_iget(fs_info->sb, &location, root, NULL); + inode = btrfs_iget_path(fs_info->sb, &location, root, NULL, path); + btrfs_release_path(path); memalloc_nofs_restore(nofs_flag); if (IS_ERR(inode)) return inode; @@ -838,6 +839,25 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, path->search_commit_root = 1; path->skip_locking = 1; + /* + * We must pass a path with search_commit_root set to btrfs_iget in + * order to avoid a deadlock when allocating extents for the tree root. + * + * When we are COWing an extent buffer from the tree root, when looking + * for a free extent, at extent-tree.c:find_free_extent(), we can find + * block group without its free space cache loaded. When we find one + * we must load its space cache which requires reading its free space + * cache's inode item from the root tree. If this inode item is located + * in the same leaf that we started COWing before, then we end up in + * deadlock on the extent buffer (trying to read lock it when we + * previously write locked it). + * + * It's safe to read the inode item using the commit root because + * block groups, once loaded, stay in memory forever (until they are + * removed) as well as their space caches once loaded. New block groups + * once created get their ->cached field set to BTRFS_CACHE_FINISHED so + * we will never try to read their inode item while the fs is mounted. + */ inode = lookup_free_space_inode(fs_info, block_group, path); if (IS_ERR(inode)) { btrfs_free_path(path); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index d6736595ec57..e5089087eaa6 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -74,11 +74,11 @@ out: return ret; } -struct btrfs_free_space_info * -search_free_space_info(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *block_group, - struct btrfs_path *path, int cow) +EXPORT_FOR_TESTS +struct btrfs_free_space_info *search_free_space_info( + struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, int cow) { struct btrfs_root *root = fs_info->free_space_root; struct btrfs_key key; @@ -176,6 +176,7 @@ static void le_bitmap_set(unsigned long *map, unsigned int start, int len) } } +EXPORT_FOR_TESTS int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path) @@ -315,6 +316,7 @@ out: return ret; } +EXPORT_FOR_TESTS int convert_free_space_to_extents(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path) @@ -487,6 +489,7 @@ out: return ret; } +EXPORT_FOR_TESTS int free_space_test_bit(struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 offset) { @@ -775,6 +778,7 @@ out: return ret; } +EXPORT_FOR_TESTS int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 start, u64 size) @@ -968,6 +972,7 @@ out: return ret; } +EXPORT_FOR_TESTS int __add_to_free_space_tree(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 start, u64 size) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d3df5b52278c..43eb4535319d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -27,6 +27,7 @@ #include <linux/uio.h> #include <linux/magic.h> #include <linux/iversion.h> +#include <linux/swap.h> #include <asm/unaligned.h> #include "ctree.h" #include "disk-io.h" @@ -103,23 +104,23 @@ static void __endio_write_update_ordered(struct inode *inode, /* * Cleanup all submitted ordered extents in specified range to handle errors - * from the fill_dellaloc() callback. + * from the btrfs_run_delalloc_range() callback. * * NOTE: caller must ensure that when an error happens, it can not call * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata * to be released, which we want to happen only when finishing the ordered - * extent (btrfs_finish_ordered_io()). Also note that the caller of the - * fill_delalloc() callback already does proper cleanup for the first page of - * the range, that is, it invokes the callback writepage_end_io_hook() for the - * range of the first page. + * extent (btrfs_finish_ordered_io()). */ static inline void btrfs_cleanup_ordered_extents(struct inode *inode, - const u64 offset, - const u64 bytes) + struct page *locked_page, + u64 offset, u64 bytes) { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; + u64 page_start = page_offset(locked_page); + u64 page_end = page_start + PAGE_SIZE - 1; + struct page *page; while (index <= end_index) { @@ -130,8 +131,18 @@ static inline void btrfs_cleanup_ordered_extents(struct inode *inode, ClearPagePrivate2(page); put_page(page); } - return __endio_write_update_ordered(inode, offset + PAGE_SIZE, - bytes - PAGE_SIZE, false); + + /* + * In case this page belongs to the delalloc range being instantiated + * then skip it, since the first page of a range is going to be + * properly cleaned up by the caller of run_delalloc_range + */ + if (page_start >= offset && page_end <= (offset + bytes - 1)) { + offset += PAGE_SIZE; + bytes -= PAGE_SIZE; + } + + return __endio_write_update_ordered(inode, offset, bytes, false); } static int btrfs_dirty_inode(struct inode *inode); @@ -229,7 +240,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, start >> PAGE_SHIFT); btrfs_set_file_extent_compression(leaf, ei, 0); kaddr = kmap_atomic(page); - offset = start & (PAGE_SIZE - 1); + offset = offset_in_page(start); write_extent_buffer(leaf, kaddr + offset, ptr, size); kunmap_atomic(kaddr); put_page(page); @@ -357,7 +368,7 @@ struct async_extent { struct async_cow { struct inode *inode; - struct btrfs_root *root; + struct btrfs_fs_info *fs_info; struct page *locked_page; u64 start; u64 end; @@ -538,8 +549,7 @@ again: &total_compressed); if (!ret) { - unsigned long offset = total_compressed & - (PAGE_SIZE - 1); + unsigned long offset = offset_in_page(total_compressed); struct page *page = pages[nr_pages - 1]; char *kaddr; @@ -847,14 +857,13 @@ retry: ins.offset, async_extent->pages, async_extent->nr_pages, async_cow->write_flags)) { - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct page *p = async_extent->pages[0]; const u64 start = async_extent->start; const u64 end = start + async_extent->ram_size - 1; p->mapping = inode->i_mapping; - tree->ops->writepage_end_io_hook(p, start, end, - NULL, 0); + btrfs_writepage_endio_finish_ordered(p, start, end, 0); + p->mapping = NULL; extent_clear_unlock_delalloc(inode, start, end, end, NULL, 0, @@ -1144,13 +1153,11 @@ static noinline void async_cow_submit(struct btrfs_work *work) { struct btrfs_fs_info *fs_info; struct async_cow *async_cow; - struct btrfs_root *root; unsigned long nr_pages; async_cow = container_of(work, struct async_cow, work); - root = async_cow->root; - fs_info = root->fs_info; + fs_info = async_cow->fs_info; nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >> PAGE_SHIFT; @@ -1179,7 +1186,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct async_cow *async_cow; - struct btrfs_root *root = BTRFS_I(inode)->root; unsigned long nr_pages; u64 cur_end; @@ -1189,7 +1195,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); BUG_ON(!async_cow); /* -ENOMEM */ async_cow->inode = igrab(inode); - async_cow->root = root; + async_cow->fs_info = fs_info; async_cow->locked_page = locked_page; async_cow->start = start; async_cow->write_flags = write_flags; @@ -1372,7 +1378,8 @@ next_slot: * Do the same check as in btrfs_cross_ref_exist but * without the unnecessary search. */ - if (btrfs_file_extent_generation(leaf, fi) <= + if (!nolock && + btrfs_file_extent_generation(leaf, fi) <= btrfs_root_last_snapshot(&root->root_item)) goto out_check; if (extent_type == BTRFS_FILE_EXTENT_REG && !force) @@ -1531,12 +1538,11 @@ out_check: } btrfs_release_path(path); - if (cur_offset <= end && cow_start == (u64)-1) { + if (cur_offset <= end && cow_start == (u64)-1) cow_start = cur_offset; - cur_offset = end; - } if (cow_start != (u64)-1) { + cur_offset = end; ret = cow_file_range(inode, locked_page, cow_start, end, end, page_started, nr_written, 1, NULL); if (ret) @@ -1577,12 +1583,12 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end) } /* - * extent_io.c call back to do delayed allocation processing + * Function to process delayed allocation (create CoW) for ranges which are + * being touched for the first time. */ -static int run_delalloc_range(void *private_data, struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, - struct writeback_control *wbc) +int btrfs_run_delalloc_range(void *private_data, struct page *locked_page, + u64 start, u64 end, int *page_started, unsigned long *nr_written, + struct writeback_control *wbc) { struct inode *inode = private_data; int ret; @@ -1606,14 +1612,14 @@ static int run_delalloc_range(void *private_data, struct page *locked_page, write_flags); } if (ret) - btrfs_cleanup_ordered_extents(inode, start, end - start + 1); + btrfs_cleanup_ordered_extents(inode, locked_page, start, + end - start + 1); return ret; } -static void btrfs_split_extent_hook(void *private_data, - struct extent_state *orig, u64 split) +void btrfs_split_delalloc_extent(struct inode *inode, + struct extent_state *orig, u64 split) { - struct inode *inode = private_data; u64 size; /* not delalloc, ignore it */ @@ -1626,7 +1632,7 @@ static void btrfs_split_extent_hook(void *private_data, u64 new_size; /* - * See the explanation in btrfs_merge_extent_hook, the same + * See the explanation in btrfs_merge_delalloc_extent, the same * applies here, just in reverse. */ new_size = orig->end - split + 1; @@ -1643,16 +1649,13 @@ static void btrfs_split_extent_hook(void *private_data, } /* - * extent_io.c merge_extent_hook, used to track merged delayed allocation - * extents so we can keep track of new extents that are just merged onto old - * extents, such as when we are doing sequential writes, so we can properly - * account for the metadata space we'll need. + * Handle merged delayed allocation extents so we can keep track of new extents + * that are just merged onto old extents, such as when we are doing sequential + * writes, so we can properly account for the metadata space we'll need. */ -static void btrfs_merge_extent_hook(void *private_data, - struct extent_state *new, - struct extent_state *other) +void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, + struct extent_state *other) { - struct inode *inode = private_data; u64 new_size, old_size; u32 num_extents; @@ -1756,15 +1759,12 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root, } /* - * extent_io.c set_bit_hook, used to track delayed allocation - * bytes in this file, and to maintain the list of inodes that - * have pending delalloc work to be done. + * Properly track delayed allocation bytes in the inode and to maintain the + * list of inodes that have pending delalloc work to be done. */ -static void btrfs_set_bit_hook(void *private_data, - struct extent_state *state, unsigned *bits) +void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, + unsigned *bits) { - struct inode *inode = private_data; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC)) @@ -1810,14 +1810,14 @@ static void btrfs_set_bit_hook(void *private_data, } /* - * extent_io.c clear_bit_hook, see set_bit_hook for why + * Once a range is no longer delalloc this function ensures that proper + * accounting happens. */ -static void btrfs_clear_bit_hook(void *private_data, - struct extent_state *state, - unsigned *bits) +void btrfs_clear_delalloc_extent(struct inode *vfs_inode, + struct extent_state *state, unsigned *bits) { - struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_inode *inode = BTRFS_I(vfs_inode); + struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb); u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(len); @@ -1842,7 +1842,7 @@ static void btrfs_clear_bit_hook(void *private_data, /* * We don't reserve metadata space for space cache inodes so we - * don't need to call dellalloc_release_metadata if there is an + * don't need to call delalloc_release_metadata if there is an * error. */ if (*bits & EXTENT_CLEAR_META_RESV && @@ -1881,16 +1881,21 @@ static void btrfs_clear_bit_hook(void *private_data, } /* - * Merge bio hook, this must check the chunk tree to make sure we don't create - * bios that span stripes or chunks + * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit + * in a chunk's stripe. This function ensures that bios do not span a + * stripe/chunk + * + * @page - The page we are about to add to the bio + * @size - size we want to add to the bio + * @bio - bio we want to ensure is smaller than a stripe + * @bio_flags - flags of the bio * - * return 1 if page cannot be merged to bio - * return 0 if page can be merged to bio + * return 1 if page cannot be added to the bio + * return 0 if page can be added to the bio * return error otherwise */ -int btrfs_merge_bio_hook(struct page *page, unsigned long offset, - size_t size, struct bio *bio, - unsigned long bio_flags) +int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, + unsigned long bio_flags) { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -1933,29 +1938,6 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio, } /* - * in order to insert checksums into the metadata in large chunks, - * we wait until bio submission time. All the pages in the bio are - * checksummed and sums are attached onto the ordered extent record. - * - * At IO completion time the cums attached on the ordered extent record - * are inserted into the btree - */ -blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, - int mirror_num) -{ - struct inode *inode = private_data; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - blk_status_t ret; - - ret = btrfs_map_bio(fs_info, bio, mirror_num, 1); - if (ret) { - bio->bi_status = ret; - bio_endio(bio); - } - return ret; -} - -/* * extent_io.c submission hook. This does the right thing for csum calculation * on write, or reading the csums from the tree before a read. * @@ -2057,7 +2039,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state, int dedupe) { - WARN_ON((end & (PAGE_SIZE - 1)) == 0); + WARN_ON(PAGE_ALIGNED(end)); return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, extra_bits, cached_state); } @@ -2153,7 +2135,7 @@ out_page: * to fix it up. The async helper will wait for ordered extents, set * the delalloc bit and make it safe to write the page. */ -static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) +int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -3160,8 +3142,8 @@ static void finish_ordered_fn(struct btrfs_work *work) btrfs_finish_ordered_io(ordered_extent); } -static void btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, - struct extent_state *state, int uptodate) +void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, + u64 end, int uptodate) { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -3570,10 +3552,11 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, /* * read an inode from the btree into the in-memory inode */ -static int btrfs_read_locked_inode(struct inode *inode) +static int btrfs_read_locked_inode(struct inode *inode, + struct btrfs_path *in_path) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_path *path; + struct btrfs_path *path = in_path; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -3589,15 +3572,18 @@ static int btrfs_read_locked_inode(struct inode *inode) if (!ret) filled = true; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + } memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); if (ret) { - btrfs_free_path(path); + if (path != in_path) + btrfs_free_path(path); return ret; } @@ -3683,6 +3669,21 @@ cache_index: * inode is not a directory, logging its parent unnecessarily. */ BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; + /* + * Similar reasoning for last_link_trans, needs to be set otherwise + * for a case like the following: + * + * mkdir A + * touch foo + * ln foo A/bar + * echo 2 > /proc/sys/vm/drop_caches + * fsync foo + * <power failure> + * + * Would result in link bar and directory A not existing after the power + * failure. + */ + BTRFS_I(inode)->last_link_trans = BTRFS_I(inode)->last_trans; path->slots[0]++; if (inode->i_nlink != 1 || @@ -3722,7 +3723,8 @@ cache_acl: btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret); } - btrfs_free_path(path); + if (path != in_path) + btrfs_free_path(path); if (!maybe_acls) cache_no_acl(inode); @@ -4440,31 +4442,6 @@ out: return err; } -static int truncate_space_check(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytes_deleted) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - int ret; - - /* - * This is only used to apply pressure to the enospc system, we don't - * intend to use this reservation at all. - */ - bytes_deleted = btrfs_csum_bytes_to_leaves(fs_info, bytes_deleted); - bytes_deleted *= fs_info->nodesize; - ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv, - bytes_deleted, BTRFS_RESERVE_NO_FLUSH); - if (!ret) { - trace_btrfs_space_reservation(fs_info, "transaction", - trans->transid, - bytes_deleted, 1); - trans->bytes_reserved += bytes_deleted; - } - return ret; - -} - /* * Return this if we need to call truncate_block for the last bit of the * truncate. @@ -4509,7 +4486,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u64 bytes_deleted = 0; bool be_nice = false; bool should_throttle = false; - bool should_end = false; BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); @@ -4540,7 +4516,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, /* * This function is also used to drop the items in the log tree before * we relog the inode, so if root != BTRFS_I(inode)->root, it means - * it is used to drop the loged items. So we shouldn't kill the delayed + * it is used to drop the logged items. So we shouldn't kill the delayed * items. */ if (min_type == 0 && root == BTRFS_I(inode)->root) @@ -4722,15 +4698,7 @@ delete: btrfs_abort_transaction(trans, ret); break; } - if (btrfs_should_throttle_delayed_refs(trans)) - btrfs_async_run_delayed_refs(fs_info, - trans->delayed_ref_updates * 2, - trans->transid, 0); if (be_nice) { - if (truncate_space_check(trans, root, - extent_num_bytes)) { - should_end = true; - } if (btrfs_should_throttle_delayed_refs(trans)) should_throttle = true; } @@ -4741,7 +4709,7 @@ delete: if (path->slots[0] == 0 || path->slots[0] != pending_del_slot || - should_throttle || should_end) { + should_throttle) { if (pending_del_nr) { ret = btrfs_del_items(trans, root, path, pending_del_slot, @@ -4753,23 +4721,24 @@ delete: pending_del_nr = 0; } btrfs_release_path(path); - if (should_throttle) { - unsigned long updates = trans->delayed_ref_updates; - if (updates) { - trans->delayed_ref_updates = 0; - ret = btrfs_run_delayed_refs(trans, - updates * 2); - if (ret) - break; - } - } + /* - * if we failed to refill our space rsv, bail out - * and let the transaction restart + * We can generate a lot of delayed refs, so we need to + * throttle every once and a while and make sure we're + * adding enough space to keep up with the work we are + * generating. Since we hold a transaction here we + * can't flush, and we don't want to FLUSH_LIMIT because + * we could have generated too many delayed refs to + * actually allocate, so just bail if we're short and + * let the normal reservation dance happen higher up. */ - if (should_end) { - ret = -EAGAIN; - break; + if (should_throttle) { + ret = btrfs_delayed_refs_rsv_refill(fs_info, + BTRFS_RESERVE_NO_FLUSH); + if (ret) { + ret = -EAGAIN; + break; + } } goto search_again; } else { @@ -4795,18 +4764,6 @@ out: } btrfs_free_path(path); - - if (be_nice && bytes_deleted > SZ_32M && (ret >= 0 || ret == -EAGAIN)) { - unsigned long updates = trans->delayed_ref_updates; - int err; - - if (updates) { - trans->delayed_ref_updates = 0; - err = btrfs_run_delayed_refs(trans, updates * 2); - if (err) - ret = err; - } - } return ret; } @@ -5151,7 +5108,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) truncate_setsize(inode, newsize); - /* Disable nonlocked read DIO to avoid the end less truncate */ + /* Disable nonlocked read DIO to avoid the endless truncate */ btrfs_inode_block_unlocked_dio(BTRFS_I(inode)); inode_dio_wait(inode); btrfs_inode_resume_unlocked_dio(BTRFS_I(inode)); @@ -5329,8 +5286,8 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, * Try to steal from the global reserve if there is space for * it. */ - if (!btrfs_check_space_for_delayed_refs(trans) && - !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, false)) + if (!btrfs_check_space_for_delayed_refs(fs_info) && + !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) return trans; /* If not, commit and try again. */ @@ -5644,8 +5601,9 @@ static struct inode *btrfs_iget_locked(struct super_block *s, /* Get an inode object given its location and corresponding root. * Returns in *is_new if the inode was read from disk */ -struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, - struct btrfs_root *root, int *new) +struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location, + struct btrfs_root *root, int *new, + struct btrfs_path *path) { struct inode *inode; @@ -5656,7 +5614,7 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, if (inode->i_state & I_NEW) { int ret; - ret = btrfs_read_locked_inode(inode); + ret = btrfs_read_locked_inode(inode, path); if (!ret) { inode_tree_add(inode); unlock_new_inode(inode); @@ -5678,6 +5636,12 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, return inode; } +struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, + struct btrfs_root *root, int *new) +{ + return btrfs_iget_path(s, location, root, new, NULL); +} + static struct inode *new_simple_dir(struct super_block *s, struct btrfs_key *key, struct btrfs_root *root) @@ -6395,14 +6359,19 @@ fail_dir_item: err = btrfs_del_root_ref(trans, key.objectid, root->root_key.objectid, parent_ino, &local_index, name, name_len); - + if (err) + btrfs_abort_transaction(trans, err); } else if (add_backref) { u64 local_index; int err; err = btrfs_del_inode_ref(trans, root, name, name_len, ino, parent_ino, &local_index); + if (err) + btrfs_abort_transaction(trans, err); } + + /* Return the original error code */ return ret; } @@ -6614,6 +6583,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (err) goto fail; } + BTRFS_I(inode)->last_link_trans = trans->transid; d_instantiate(dentry, inode); ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent, true, NULL); @@ -6641,7 +6611,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; int err = 0; - int drop_on_err = 0; u64 objectid = 0; u64 index = 0; @@ -6667,7 +6636,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto out_fail; } - drop_on_err = 1; /* these must be set before we unlock the inode */ inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; @@ -6688,7 +6656,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto out_fail; d_instantiate_new(dentry, inode); - drop_on_err = 0; out_fail: btrfs_end_transaction(trans); @@ -8042,9 +8009,7 @@ static void btrfs_endio_direct_read(struct bio *bio) dio_bio->bi_status = err; dio_end_io(dio_bio); - - if (io_bio->end_io) - io_bio->end_io(io_bio, blk_status_to_errno(err)); + btrfs_io_bio_free_csum(io_bio); bio_put(bio); } @@ -8087,7 +8052,7 @@ static void __endio_write_update_ordered(struct inode *inode, return; /* * Our bio might span multiple ordered extents. In this case - * we keep goin until we have accounted the whole dio. + * we keep going until we have accounted the whole dio. */ if (ordered_offset < offset + bytes) { ordered_bytes = offset + bytes - ordered_offset; @@ -8397,8 +8362,7 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, if (!ret) return; - if (io_bio->end_io) - io_bio->end_io(io_bio, ret); + btrfs_io_bio_free_csum(io_bio); free_ordered: /* @@ -8901,7 +8865,7 @@ again: /* page is wholly or partially inside EOF */ if (page_start + PAGE_SIZE > size) - zero_start = size & ~PAGE_MASK; + zero_start = offset_in_page(size); else zero_start = PAGE_SIZE; @@ -9146,6 +9110,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->index_cnt = (u64)-1; ei->dir_index = 0; ei->last_unlink_trans = 0; + ei->last_link_trans = 0; ei->last_log_commit = 0; spin_lock_init(&ei->lock); @@ -9957,7 +9922,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. */ -static int start_delalloc_inodes(struct btrfs_root *root, int nr) +static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot) { struct btrfs_inode *binode; struct inode *inode; @@ -9985,6 +9950,9 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr) } spin_unlock(&root->delalloc_lock); + if (snapshot) + set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, + &binode->runtime_flags); work = btrfs_alloc_delalloc_work(inode); if (!work) { iput(inode); @@ -10018,7 +9986,7 @@ out: return ret; } -int btrfs_start_delalloc_inodes(struct btrfs_root *root) +int btrfs_start_delalloc_snapshot(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; int ret; @@ -10026,7 +9994,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root) if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; - ret = start_delalloc_inodes(root, -1); + ret = start_delalloc_inodes(root, -1, true); if (ret > 0) ret = 0; return ret; @@ -10055,7 +10023,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr) &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); - ret = start_delalloc_inodes(root, nr); + ret = start_delalloc_inodes(root, nr, false); btrfs_put_fs_root(root); if (ret < 0) goto out; @@ -10434,26 +10402,6 @@ out: return ret; } -__attribute__((const)) -static int btrfs_readpage_io_failed_hook(struct page *page, int failed_mirror) -{ - return -EAGAIN; -} - -static void btrfs_check_extent_io_range(void *private_data, const char *caller, - u64 start, u64 end) -{ - struct inode *inode = private_data; - u64 isize; - - isize = i_size_read(inode); - if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { - btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, - "%s: ino %llu isize %llu odd range [%llu,%llu]", - caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); - } -} - void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) { struct inode *inode = tree->private_data; @@ -10470,6 +10418,343 @@ void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) } } +#ifdef CONFIG_SWAP +/* + * Add an entry indicating a block group or device which is pinned by a + * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a + * negative errno on failure. + */ +static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr, + bool is_block_group) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_swapfile_pin *sp, *entry; + struct rb_node **p; + struct rb_node *parent = NULL; + + sp = kmalloc(sizeof(*sp), GFP_NOFS); + if (!sp) + return -ENOMEM; + sp->ptr = ptr; + sp->inode = inode; + sp->is_block_group = is_block_group; + + spin_lock(&fs_info->swapfile_pins_lock); + p = &fs_info->swapfile_pins.rb_node; + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_swapfile_pin, node); + if (sp->ptr < entry->ptr || + (sp->ptr == entry->ptr && sp->inode < entry->inode)) { + p = &(*p)->rb_left; + } else if (sp->ptr > entry->ptr || + (sp->ptr == entry->ptr && sp->inode > entry->inode)) { + p = &(*p)->rb_right; + } else { + spin_unlock(&fs_info->swapfile_pins_lock); + kfree(sp); + return 1; + } + } + rb_link_node(&sp->node, parent, p); + rb_insert_color(&sp->node, &fs_info->swapfile_pins); + spin_unlock(&fs_info->swapfile_pins_lock); + return 0; +} + +/* Free all of the entries pinned by this swapfile. */ +static void btrfs_free_swapfile_pins(struct inode *inode) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_swapfile_pin *sp; + struct rb_node *node, *next; + + spin_lock(&fs_info->swapfile_pins_lock); + node = rb_first(&fs_info->swapfile_pins); + while (node) { + next = rb_next(node); + sp = rb_entry(node, struct btrfs_swapfile_pin, node); + if (sp->inode == inode) { + rb_erase(&sp->node, &fs_info->swapfile_pins); + if (sp->is_block_group) + btrfs_put_block_group(sp->ptr); + kfree(sp); + } + node = next; + } + spin_unlock(&fs_info->swapfile_pins_lock); +} + +struct btrfs_swap_info { + u64 start; + u64 block_start; + u64 block_len; + u64 lowest_ppage; + u64 highest_ppage; + unsigned long nr_pages; + int nr_extents; +}; + +static int btrfs_add_swap_extent(struct swap_info_struct *sis, + struct btrfs_swap_info *bsi) +{ + unsigned long nr_pages; + u64 first_ppage, first_ppage_reported, next_ppage; + int ret; + + first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT; + next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len, + PAGE_SIZE) >> PAGE_SHIFT; + + if (first_ppage >= next_ppage) + return 0; + nr_pages = next_ppage - first_ppage; + + first_ppage_reported = first_ppage; + if (bsi->start == 0) + first_ppage_reported++; + if (bsi->lowest_ppage > first_ppage_reported) + bsi->lowest_ppage = first_ppage_reported; + if (bsi->highest_ppage < (next_ppage - 1)) + bsi->highest_ppage = next_ppage - 1; + + ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage); + if (ret < 0) + return ret; + bsi->nr_extents += ret; + bsi->nr_pages += nr_pages; + return 0; +} + +static void btrfs_swap_deactivate(struct file *file) +{ + struct inode *inode = file_inode(file); + + btrfs_free_swapfile_pins(inode); + atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles); +} + +static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, + sector_t *span) +{ + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_state *cached_state = NULL; + struct extent_map *em = NULL; + struct btrfs_device *device = NULL; + struct btrfs_swap_info bsi = { + .lowest_ppage = (sector_t)-1ULL, + }; + int ret = 0; + u64 isize; + u64 start; + + /* + * If the swap file was just created, make sure delalloc is done. If the + * file changes again after this, the user is doing something stupid and + * we don't really care. + */ + ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); + if (ret) + return ret; + + /* + * The inode is locked, so these flags won't change after we check them. + */ + if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { + btrfs_warn(fs_info, "swapfile must not be compressed"); + return -EINVAL; + } + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { + btrfs_warn(fs_info, "swapfile must not be copy-on-write"); + return -EINVAL; + } + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + btrfs_warn(fs_info, "swapfile must not be checksummed"); + return -EINVAL; + } + + /* + * Balance or device remove/replace/resize can move stuff around from + * under us. The EXCL_OP flag makes sure they aren't running/won't run + * concurrently while we are mapping the swap extents, and + * fs_info->swapfile_pins prevents them from running while the swap file + * is active and moving the extents. Note that this also prevents a + * concurrent device add which isn't actually necessary, but it's not + * really worth the trouble to allow it. + */ + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + btrfs_warn(fs_info, + "cannot activate swapfile while exclusive operation is running"); + return -EBUSY; + } + /* + * Snapshots can create extents which require COW even if NODATACOW is + * set. We use this counter to prevent snapshots. We must increment it + * before walking the extents because we don't want a concurrent + * snapshot to run after we've already checked the extents. + */ + atomic_inc(&BTRFS_I(inode)->root->nr_swapfiles); + + isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); + + lock_extent_bits(io_tree, 0, isize - 1, &cached_state); + start = 0; + while (start < isize) { + u64 logical_block_start, physical_block_start; + struct btrfs_block_group_cache *bg; + u64 len = isize - start; + + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + + if (em->block_start == EXTENT_MAP_HOLE) { + btrfs_warn(fs_info, "swapfile must not have holes"); + ret = -EINVAL; + goto out; + } + if (em->block_start == EXTENT_MAP_INLINE) { + /* + * It's unlikely we'll ever actually find ourselves + * here, as a file small enough to fit inline won't be + * big enough to store more than the swap header, but in + * case something changes in the future, let's catch it + * here rather than later. + */ + btrfs_warn(fs_info, "swapfile must not be inline"); + ret = -EINVAL; + goto out; + } + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + btrfs_warn(fs_info, "swapfile must not be compressed"); + ret = -EINVAL; + goto out; + } + + logical_block_start = em->block_start + (start - em->start); + len = min(len, em->len - (start - em->start)); + free_extent_map(em); + em = NULL; + + ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL); + if (ret < 0) { + goto out; + } else if (ret) { + ret = 0; + } else { + btrfs_warn(fs_info, + "swapfile must not be copy-on-write"); + ret = -EINVAL; + goto out; + } + + em = btrfs_get_chunk_map(fs_info, logical_block_start, len); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + + if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + btrfs_warn(fs_info, + "swapfile must have single data profile"); + ret = -EINVAL; + goto out; + } + + if (device == NULL) { + device = em->map_lookup->stripes[0].dev; + ret = btrfs_add_swapfile_pin(inode, device, false); + if (ret == 1) + ret = 0; + else if (ret) + goto out; + } else if (device != em->map_lookup->stripes[0].dev) { + btrfs_warn(fs_info, "swapfile must be on one device"); + ret = -EINVAL; + goto out; + } + + physical_block_start = (em->map_lookup->stripes[0].physical + + (logical_block_start - em->start)); + len = min(len, em->len - (logical_block_start - em->start)); + free_extent_map(em); + em = NULL; + + bg = btrfs_lookup_block_group(fs_info, logical_block_start); + if (!bg) { + btrfs_warn(fs_info, + "could not find block group containing swapfile"); + ret = -EINVAL; + goto out; + } + + ret = btrfs_add_swapfile_pin(inode, bg, true); + if (ret) { + btrfs_put_block_group(bg); + if (ret == 1) + ret = 0; + else + goto out; + } + + if (bsi.block_len && + bsi.block_start + bsi.block_len == physical_block_start) { + bsi.block_len += len; + } else { + if (bsi.block_len) { + ret = btrfs_add_swap_extent(sis, &bsi); + if (ret) + goto out; + } + bsi.start = start; + bsi.block_start = physical_block_start; + bsi.block_len = len; + } + + start += len; + } + + if (bsi.block_len) + ret = btrfs_add_swap_extent(sis, &bsi); + +out: + if (!IS_ERR_OR_NULL(em)) + free_extent_map(em); + + unlock_extent_cached(io_tree, 0, isize - 1, &cached_state); + + if (ret) + btrfs_swap_deactivate(file); + + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + + if (ret) + return ret; + + if (device) + sis->bdev = device->bdev; + *span = bsi.highest_ppage - bsi.lowest_ppage + 1; + sis->max = bsi.nr_pages; + sis->pages = bsi.nr_pages - 1; + sis->highest_bit = bsi.nr_pages - 1; + return bsi.nr_extents; +} +#else +static void btrfs_swap_deactivate(struct file *file) +{ +} + +static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, + sector_t *span) +{ + return -EOPNOTSUPP; +} +#endif + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, @@ -10512,17 +10797,6 @@ static const struct extent_io_ops btrfs_extent_io_ops = { /* mandatory callbacks */ .submit_bio_hook = btrfs_submit_bio_hook, .readpage_end_io_hook = btrfs_readpage_end_io_hook, - .readpage_io_failed_hook = btrfs_readpage_io_failed_hook, - - /* optional callbacks */ - .fill_delalloc = run_delalloc_range, - .writepage_end_io_hook = btrfs_writepage_end_io_hook, - .writepage_start_hook = btrfs_writepage_start_hook, - .set_bit_hook = btrfs_set_bit_hook, - .clear_bit_hook = btrfs_clear_bit_hook, - .merge_extent_hook = btrfs_merge_extent_hook, - .split_extent_hook = btrfs_split_extent_hook, - .check_extent_io_range = btrfs_check_extent_io_range, }; /* @@ -10547,6 +10821,8 @@ static const struct address_space_operations btrfs_aops = { .releasepage = btrfs_releasepage, .set_page_dirty = btrfs_set_page_dirty, .error_remove_page = generic_error_remove_page, + .swap_activate = btrfs_swap_activate, + .swap_deactivate = btrfs_swap_deactivate, }; static const struct inode_operations btrfs_file_inode_operations = { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 3ca6943827ef..fab9443f6a42 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -290,6 +290,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } else if (fsflags & FS_COMPR_FL) { const char *comp; + if (IS_SWAPFILE(inode)) { + ret = -ETXTBSY; + goto out_unlock; + } + binode->flags |= BTRFS_INODE_COMPRESS; binode->flags &= ~BTRFS_INODE_NOCOMPRESS; @@ -754,6 +759,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return -EINVAL; + if (atomic_read(&root->nr_swapfiles)) { + btrfs_warn(fs_info, + "cannot snapshot subvolume with active swapfile"); + return -ETXTBSY; + } + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_KERNEL); if (!pending_snapshot) return -ENOMEM; @@ -777,7 +788,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, wait_event(root->subv_writers->wait, percpu_counter_sum(&root->subv_writers->counter) == 0); - ret = btrfs_start_delalloc_inodes(root); + ret = btrfs_start_delalloc_snapshot(root); if (ret) goto dec_and_free; @@ -1505,9 +1516,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, } inode_lock(inode); - if (do_compress) - BTRFS_I(inode)->defrag_compress = compress_type; - ret = cluster_pages_for_defrag(inode, pages, i, cluster); + if (IS_SWAPFILE(inode)) { + ret = -ETXTBSY; + } else { + if (do_compress) + BTRFS_I(inode)->defrag_compress = compress_type; + ret = cluster_pages_for_defrag(inode, pages, i, cluster); + } if (ret < 0) { inode_unlock(inode); goto out_ra; @@ -3135,7 +3150,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, } rcu_read_unlock(); - memcpy(&fi_args->fsid, fs_info->fsid, sizeof(fi_args->fsid)); + memcpy(&fi_args->fsid, fs_devices->fsid, sizeof(fi_args->fsid)); fi_args->nodesize = fs_info->nodesize; fi_args->sectorsize = fs_info->sectorsize; fi_args->clone_alignment = fs_info->sectorsize; @@ -3191,92 +3206,6 @@ out: return ret; } -static struct page *extent_same_get_page(struct inode *inode, pgoff_t index) -{ - struct page *page; - - page = grab_cache_page(inode->i_mapping, index); - if (!page) - return ERR_PTR(-ENOMEM); - - if (!PageUptodate(page)) { - int ret; - - ret = btrfs_readpage(NULL, page); - if (ret) - return ERR_PTR(ret); - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - put_page(page); - return ERR_PTR(-EIO); - } - if (page->mapping != inode->i_mapping) { - unlock_page(page); - put_page(page); - return ERR_PTR(-EAGAIN); - } - } - - return page; -} - -static int gather_extent_pages(struct inode *inode, struct page **pages, - int num_pages, u64 off) -{ - int i; - pgoff_t index = off >> PAGE_SHIFT; - - for (i = 0; i < num_pages; i++) { -again: - pages[i] = extent_same_get_page(inode, index + i); - if (IS_ERR(pages[i])) { - int err = PTR_ERR(pages[i]); - - if (err == -EAGAIN) - goto again; - pages[i] = NULL; - return err; - } - } - return 0; -} - -static int lock_extent_range(struct inode *inode, u64 off, u64 len, - bool retry_range_locking) -{ - /* - * Do any pending delalloc/csum calculations on inode, one way or - * another, and lock file content. - * The locking order is: - * - * 1) pages - * 2) range in the inode's io tree - */ - while (1) { - struct btrfs_ordered_extent *ordered; - lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); - ordered = btrfs_lookup_first_ordered_extent(inode, - off + len - 1); - if ((!ordered || - ordered->file_offset + ordered->len <= off || - ordered->file_offset >= off + len) && - !test_range_bit(&BTRFS_I(inode)->io_tree, off, - off + len - 1, EXTENT_DELALLOC, 0, NULL)) { - if (ordered) - btrfs_put_ordered_extent(ordered); - break; - } - unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); - if (ordered) - btrfs_put_ordered_extent(ordered); - if (!retry_range_locking) - return -EAGAIN; - btrfs_wait_ordered_range(inode, off, len); - } - return 0; -} - static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) { inode_unlock(inode1); @@ -3292,259 +3221,32 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) inode_lock_nested(inode2, I_MUTEX_CHILD); } -static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) -{ - unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); - unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); -} - -static int btrfs_double_extent_lock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len, - bool retry_range_locking) -{ - int ret; - - if (inode1 < inode2) { - swap(inode1, inode2); - swap(loff1, loff2); - } - ret = lock_extent_range(inode1, loff1, len, retry_range_locking); - if (ret) - return ret; - ret = lock_extent_range(inode2, loff2, len, retry_range_locking); - if (ret) - unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, - loff1 + len - 1); - return ret; -} - -struct cmp_pages { - int num_pages; - struct page **src_pages; - struct page **dst_pages; -}; - -static void btrfs_cmp_data_free(struct cmp_pages *cmp) -{ - int i; - struct page *pg; - - for (i = 0; i < cmp->num_pages; i++) { - pg = cmp->src_pages[i]; - if (pg) { - unlock_page(pg); - put_page(pg); - cmp->src_pages[i] = NULL; - } - pg = cmp->dst_pages[i]; - if (pg) { - unlock_page(pg); - put_page(pg); - cmp->dst_pages[i] = NULL; - } - } -} - -static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, - struct inode *dst, u64 dst_loff, - u64 len, struct cmp_pages *cmp) -{ - int ret; - int num_pages = PAGE_ALIGN(len) >> PAGE_SHIFT; - - cmp->num_pages = num_pages; - - ret = gather_extent_pages(src, cmp->src_pages, num_pages, loff); - if (ret) - goto out; - - ret = gather_extent_pages(dst, cmp->dst_pages, num_pages, dst_loff); - -out: - if (ret) - btrfs_cmp_data_free(cmp); - return ret; -} - -static int btrfs_cmp_data(u64 len, struct cmp_pages *cmp) -{ - int ret = 0; - int i; - struct page *src_page, *dst_page; - unsigned int cmp_len = PAGE_SIZE; - void *addr, *dst_addr; - - i = 0; - while (len) { - if (len < PAGE_SIZE) - cmp_len = len; - - BUG_ON(i >= cmp->num_pages); - - src_page = cmp->src_pages[i]; - dst_page = cmp->dst_pages[i]; - ASSERT(PageLocked(src_page)); - ASSERT(PageLocked(dst_page)); - - addr = kmap_atomic(src_page); - dst_addr = kmap_atomic(dst_page); - - flush_dcache_page(src_page); - flush_dcache_page(dst_page); - - if (memcmp(addr, dst_addr, cmp_len)) - ret = -EBADE; - - kunmap_atomic(addr); - kunmap_atomic(dst_addr); - - if (ret) - break; - - len -= cmp_len; - i++; - } - - return ret; -} - -static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen, - u64 olen) -{ - u64 len = *plen; - u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize; - - if (off + olen > inode->i_size || off + olen < off) - return -EINVAL; - - /* if we extend to eof, continue to block boundary */ - if (off + len == inode->i_size) - *plen = len = ALIGN(inode->i_size, bs) - off; - - /* Check that we are block aligned - btrfs_clone() requires this */ - if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs)) - return -EINVAL; - - return 0; -} - static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen, - struct inode *dst, u64 dst_loff, - struct cmp_pages *cmp) + struct inode *dst, u64 dst_loff) { + u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; int ret; u64 len = olen; - bool same_inode = (src == dst); - u64 same_lock_start = 0; - u64 same_lock_len = 0; - - ret = extent_same_check_offsets(src, loff, &len, olen); - if (ret) - return ret; - - ret = extent_same_check_offsets(dst, dst_loff, &len, olen); - if (ret) - return ret; - if (same_inode) { - /* - * Single inode case wants the same checks, except we - * don't want our length pushed out past i_size as - * comparing that data range makes no sense. - * - * extent_same_check_offsets() will do this for an - * unaligned length at i_size, so catch it here and - * reject the request. - * - * This effectively means we require aligned extents - * for the single-inode case, whereas the other cases - * allow an unaligned length so long as it ends at - * i_size. - */ - if (len != olen) - return -EINVAL; - - /* Check for overlapping ranges */ - if (dst_loff + len > loff && dst_loff < loff + len) - return -EINVAL; - - same_lock_start = min_t(u64, loff, dst_loff); - same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start; - } else { - /* - * If the source and destination inodes are different, the - * source's range end offset matches the source's i_size, that - * i_size is not a multiple of the sector size, and the - * destination range does not go past the destination's i_size, - * we must round down the length to the nearest sector size - * multiple. If we don't do this adjustment we end replacing - * with zeroes the bytes in the range that starts at the - * deduplication range's end offset and ends at the next sector - * size multiple. - */ - if (loff + olen == i_size_read(src) && - dst_loff + len < i_size_read(dst)) { - const u64 sz = BTRFS_I(src)->root->fs_info->sectorsize; - - len = round_down(i_size_read(src), sz) - loff; - olen = len; - } - } - -again: - ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, cmp); - if (ret) - return ret; - - if (same_inode) - ret = lock_extent_range(src, same_lock_start, same_lock_len, - false); - else - ret = btrfs_double_extent_lock(src, loff, dst, dst_loff, len, - false); + if (loff + len == src->i_size) + len = ALIGN(src->i_size, bs) - loff; /* - * If one of the inodes has dirty pages in the respective range or - * ordered extents, we need to flush dellaloc and wait for all ordered - * extents in the range. We must unlock the pages and the ranges in the - * io trees to avoid deadlocks when flushing delalloc (requires locking - * pages) and when waiting for ordered extents to complete (they require - * range locking). + * For same inode case we don't want our length pushed out past i_size + * as comparing that data range makes no sense. + * + * This effectively means we require aligned extents for the single + * inode case, whereas the other cases allow an unaligned length so long + * as it ends at i_size. */ - if (ret == -EAGAIN) { - /* - * Ranges in the io trees already unlocked. Now unlock all - * pages before waiting for all IO to complete. - */ - btrfs_cmp_data_free(cmp); - if (same_inode) { - btrfs_wait_ordered_range(src, same_lock_start, - same_lock_len); - } else { - btrfs_wait_ordered_range(src, loff, len); - btrfs_wait_ordered_range(dst, dst_loff, len); - } - goto again; - } - ASSERT(ret == 0); - if (WARN_ON(ret)) { - /* ranges in the io trees already unlocked */ - btrfs_cmp_data_free(cmp); - return ret; - } - - /* pass original length for comparison so we stay within i_size */ - ret = btrfs_cmp_data(olen, cmp); - if (ret == 0) - ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1); - - if (same_inode) - unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start, - same_lock_start + same_lock_len - 1); - else - btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); + if (dst == src && len != olen) + return -EINVAL; - btrfs_cmp_data_free(cmp); + /* + * Lock destination range to serialize with concurrent readpages(). + */ + lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, dst_loff + len - 1); + ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1); + unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, dst_loff + len - 1); return ret; } @@ -3555,58 +3257,27 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, struct inode *dst, u64 dst_loff) { int ret; - struct cmp_pages cmp; int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT; - bool same_inode = (src == dst); u64 i, tail_len, chunk_count; - if (olen == 0) - return 0; - - if (same_inode) - inode_lock(src); - else - btrfs_double_inode_lock(src, dst); - /* don't make the dst file partly checksummed */ if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != - (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) { - ret = -EINVAL; - goto out_unlock; - } + (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) + return -EINVAL; + + if (IS_SWAPFILE(src) || IS_SWAPFILE(dst)) + return -ETXTBSY; tail_len = olen % BTRFS_MAX_DEDUPE_LEN; chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); if (chunk_count == 0) num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT; - /* - * If deduping ranges in the same inode, locking rules make it - * mandatory to always lock pages in ascending order to avoid deadlocks - * with concurrent tasks (such as starting writeback/delalloc). - */ - if (same_inode && dst_loff < loff) - swap(loff, dst_loff); - - /* - * We must gather up all the pages before we initiate our extent - * locking. We use an array for the page pointers. Size of the array is - * bounded by len, which is in turn bounded by BTRFS_MAX_DEDUPE_LEN. - */ - cmp.src_pages = kvmalloc_array(num_pages, sizeof(struct page *), - GFP_KERNEL | __GFP_ZERO); - cmp.dst_pages = kvmalloc_array(num_pages, sizeof(struct page *), - GFP_KERNEL | __GFP_ZERO); - if (!cmp.src_pages || !cmp.dst_pages) { - ret = -ENOMEM; - goto out_free; - } - for (i = 0; i < chunk_count; i++) { ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, - dst, dst_loff, &cmp); + dst, dst_loff); if (ret) - goto out_free; + return ret; loff += BTRFS_MAX_DEDUPE_LEN; dst_loff += BTRFS_MAX_DEDUPE_LEN; @@ -3614,17 +3285,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, if (tail_len > 0) ret = btrfs_extent_same_range(src, loff, tail_len, dst, - dst_loff, &cmp); - -out_free: - kvfree(cmp.src_pages); - kvfree(cmp.dst_pages); - -out_unlock: - if (same_inode) - inode_unlock(src); - else - btrfs_double_inode_unlock(src, dst); + dst_loff); return ret; } @@ -4211,11 +3872,9 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, struct inode *inode = file_inode(file); struct inode *src = file_inode(file_src); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; int ret; u64 len = olen; u64 bs = fs_info->sb->s_blocksize; - int same_inode = src == inode; /* * TODO: @@ -4228,93 +3887,35 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, * be either compressed or non-compressed. */ - if (btrfs_root_readonly(root)) - return -EROFS; - - if (file_src->f_path.mnt != file->f_path.mnt || - src->i_sb != inode->i_sb) - return -EXDEV; - - if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) - return -EISDIR; - - if (!same_inode) { - btrfs_double_inode_lock(src, inode); - } else { - inode_lock(src); - } - /* don't make the dst file partly checksummed */ if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != - (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - ret = -EINVAL; - goto out_unlock; - } + (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) + return -EINVAL; - /* determine range to clone */ - ret = -EINVAL; - if (off + len > src->i_size || off + len < off) - goto out_unlock; - if (len == 0) - olen = len = src->i_size - off; - /* if we extend to eof, continue to block boundary */ + if (IS_SWAPFILE(src) || IS_SWAPFILE(inode)) + return -ETXTBSY; + + /* + * VFS's generic_remap_file_range_prep() protects us from cloning the + * eof block into the middle of a file, which would result in corruption + * if the file size is not blocksize aligned. So we don't need to check + * for that case here. + */ if (off + len == src->i_size) len = ALIGN(src->i_size, bs) - off; - if (len == 0) { - ret = 0; - goto out_unlock; - } - - /* verify the end result is block aligned */ - if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || - !IS_ALIGNED(destoff, bs)) - goto out_unlock; - - /* verify if ranges are overlapped within the same file */ - if (same_inode) { - if (destoff + len > off && destoff < off + len) - goto out_unlock; - } - if (destoff > inode->i_size) { ret = btrfs_cont_expand(inode, inode->i_size, destoff); if (ret) - goto out_unlock; + return ret; } /* - * Lock the target range too. Right after we replace the file extent - * items in the fs tree (which now point to the cloned data), we might - * have a worker replace them with extent items relative to a write - * operation that was issued before this clone operation (i.e. confront - * with inode.c:btrfs_finish_ordered_io). + * Lock destination range to serialize with concurrent readpages(). */ - if (same_inode) { - u64 lock_start = min_t(u64, off, destoff); - u64 lock_len = max_t(u64, off, destoff) + len - lock_start; - - ret = lock_extent_range(src, lock_start, lock_len, true); - } else { - ret = btrfs_double_extent_lock(src, off, inode, destoff, len, - true); - } - ASSERT(ret == 0); - if (WARN_ON(ret)) { - /* ranges in the io trees already unlocked */ - goto out_unlock; - } - + lock_extent(&BTRFS_I(inode)->io_tree, destoff, destoff + len - 1); ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); - - if (same_inode) { - u64 lock_start = min_t(u64, off, destoff); - u64 lock_end = max_t(u64, off, destoff) + len - 1; - - unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end); - } else { - btrfs_double_extent_unlock(src, off, inode, destoff, len); - } + unlock_extent(&BTRFS_I(inode)->io_tree, destoff, destoff + len - 1); /* * Truncate page cache pages so that future reads will see the cloned * data immediately and not the previous data. @@ -4322,11 +3923,87 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, truncate_inode_pages_range(&inode->i_data, round_down(destoff, PAGE_SIZE), round_up(destoff + len, PAGE_SIZE) - 1); -out_unlock: + + return ret; +} + +static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize; + bool same_inode = inode_out == inode_in; + u64 wb_len; + int ret; + + if (!(remap_flags & REMAP_FILE_DEDUP)) { + struct btrfs_root *root_out = BTRFS_I(inode_out)->root; + + if (btrfs_root_readonly(root_out)) + return -EROFS; + + if (file_in->f_path.mnt != file_out->f_path.mnt || + inode_in->i_sb != inode_out->i_sb) + return -EXDEV; + } + + if (same_inode) + inode_lock(inode_in); + else + btrfs_double_inode_lock(inode_in, inode_out); + + /* + * Now that the inodes are locked, we need to start writeback ourselves + * and can not rely on the writeback from the VFS's generic helper + * generic_remap_file_range_prep() because: + * + * 1) For compression we must call filemap_fdatawrite_range() range + * twice (btrfs_fdatawrite_range() does it for us), and the generic + * helper only calls it once; + * + * 2) filemap_fdatawrite_range(), called by the generic helper only + * waits for the writeback to complete, i.e. for IO to be done, and + * not for the ordered extents to complete. We need to wait for them + * to complete so that new file extent items are in the fs tree. + */ + if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP)) + wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs); + else + wb_len = ALIGN(*len, bs); + + /* + * Since we don't lock ranges, wait for ongoing lockless dio writes (as + * any in progress could create its ordered extents after we wait for + * existing ordered extents below). + */ + inode_dio_wait(inode_in); if (!same_inode) - btrfs_double_inode_unlock(src, inode); + inode_dio_wait(inode_out); + + ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), + wb_len); + if (ret < 0) + goto out_unlock; + ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), + wb_len); + if (ret < 0) + goto out_unlock; + + ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, + len, remap_flags); + if (ret < 0 || *len == 0) + goto out_unlock; + + return 0; + + out_unlock: + if (same_inode) + inode_unlock(inode_in); else - inode_unlock(src); + btrfs_double_inode_unlock(inode_in, inode_out); + return ret; } @@ -4334,29 +4011,29 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, struct file *dst_file, loff_t destoff, loff_t len, unsigned int remap_flags) { + struct inode *src_inode = file_inode(src_file); + struct inode *dst_inode = file_inode(dst_file); + bool same_inode = dst_inode == src_inode; int ret; if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; - if (remap_flags & REMAP_FILE_DEDUP) { - struct inode *src = file_inode(src_file); - struct inode *dst = file_inode(dst_file); - u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; - - if (WARN_ON_ONCE(bs < PAGE_SIZE)) { - /* - * Btrfs does not support blocksize < page_size. As a - * result, btrfs_cmp_data() won't correctly handle - * this situation without an update. - */ - return -EINVAL; - } + ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff, + &len, remap_flags); + if (ret < 0 || len == 0) + return ret; - ret = btrfs_extent_same(src, off, len, dst, destoff); - } else { + if (remap_flags & REMAP_FILE_DEDUP) + ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff); + else ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); - } + + if (same_inode) + inode_unlock(src_inode); + else + btrfs_double_inode_unlock(src_inode, dst_inode); + return ret < 0 ? ret : len; } diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index b6a4cc178bee..90639140439f 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -27,7 +27,7 @@ * Records the total size (including the header) of compressed data. * * 2. Segment(s) - * Variable size. Each segment includes one segment header, followd by data + * Variable size. Each segment includes one segment header, followed by data * payload. * One regular LZO compressed extent can have one or more segments. * For inlined LZO compressed extent, only one segment is allowed. diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 0c4ef208b8b9..6fde2b2741ef 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -460,7 +460,6 @@ void btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_inode *btrfs_inode = BTRFS_I(inode); struct btrfs_root *root = btrfs_inode->root; struct rb_node *node; - bool dec_pending_ordered = false; /* This is paired with btrfs_add_ordered_extent. */ spin_lock(&btrfs_inode->lock); @@ -477,37 +476,8 @@ void btrfs_remove_ordered_extent(struct inode *inode, if (tree->last == node) tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); - if (test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags)) - dec_pending_ordered = true; spin_unlock_irq(&tree->lock); - /* - * The current running transaction is waiting on us, we need to let it - * know that we're complete and wake it up. - */ - if (dec_pending_ordered) { - struct btrfs_transaction *trans; - - /* - * The checks for trans are just a formality, it should be set, - * but if it isn't we don't want to deref/assert under the spin - * lock, so be nice and check if trans is set, but ASSERT() so - * if it isn't set a developer will notice. - */ - spin_lock(&fs_info->trans_lock); - trans = fs_info->running_transaction; - if (trans) - refcount_inc(&trans->use_count); - spin_unlock(&fs_info->trans_lock); - - ASSERT(trans); - if (trans) { - if (atomic_dec_and_test(&trans->pending_ordered)) - wake_up(&trans->pending_wait); - btrfs_put_transaction(trans); - } - } - spin_lock(&root->ordered_extent_lock); list_del_init(&entry->root_extent_list); root->nr_ordered_extents--; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 02d813aaa261..fb9a161f0215 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -37,28 +37,31 @@ struct btrfs_ordered_sum { * rbtree, just before waking any waiters. It is used to indicate the * IO is done and any metadata is inserted into the tree. */ -#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */ - -#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */ - -#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ - -#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */ - -#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to preallocated extent */ - -#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */ - -#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */ - -#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent - * has done its due diligence in updating - * the isize. */ -#define BTRFS_ORDERED_TRUNCATED 8 /* Set when we have to truncate an extent */ - -#define BTRFS_ORDERED_PENDING 9 /* We are waiting for this ordered extent to - * complete in the current transaction. */ -#define BTRFS_ORDERED_REGULAR 10 /* Regular IO for COW */ +enum { + /* set when all the pages are written */ + BTRFS_ORDERED_IO_DONE, + /* set when removed from the tree */ + BTRFS_ORDERED_COMPLETE, + /* set when we want to write in place */ + BTRFS_ORDERED_NOCOW, + /* writing a zlib compressed extent */ + BTRFS_ORDERED_COMPRESSED, + /* set when writing to preallocated extent */ + BTRFS_ORDERED_PREALLOC, + /* set when we're doing DIO with this extent */ + BTRFS_ORDERED_DIRECT, + /* We had an io error when writing this out */ + BTRFS_ORDERED_IOERR, + /* + * indicates whether this ordered extent has done its due diligence in + * updating the isize + */ + BTRFS_ORDERED_UPDATED_ISIZE, + /* Set when we have to truncate an extent */ + BTRFS_ORDERED_TRUNCATED, + /* Regular IO for COW */ + BTRFS_ORDERED_REGULAR, +}; struct btrfs_ordered_extent { /* logical offset in the file */ diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 45868fd76209..4e473a998219 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -30,7 +30,7 @@ * - sync * - copy also limits on subvol creation * - limit - * - caches fuer ulists + * - caches for ulists * - performance benchmarks * - check all ioctl parameters */ @@ -522,7 +522,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) __del_qgroup_rb(qgroup); } /* - * we call btrfs_free_qgroup_config() when umounting + * We call btrfs_free_qgroup_config() when unmounting * filesystem and disabling quota, so we set qgroup_ulist * to be null here to avoid double free. */ @@ -1013,16 +1013,22 @@ out_add_root: btrfs_abort_transaction(trans, ret); goto out_free_path; } - spin_lock(&fs_info->qgroup_lock); - fs_info->quota_root = quota_root; - set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - spin_unlock(&fs_info->qgroup_lock); ret = btrfs_commit_transaction(trans); trans = NULL; if (ret) goto out_free_path; + /* + * Set quota enabled flag after committing the transaction, to avoid + * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot + * creation. + */ + spin_lock(&fs_info->qgroup_lock); + fs_info->quota_root = quota_root; + set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + spin_unlock(&fs_info->qgroup_lock); + ret = qgroup_rescan_init(fs_info, 0, 1); if (!ret) { qgroup_rescan_zero_tracking(fs_info); @@ -1122,7 +1128,7 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info, * The easy accounting, we're updating qgroup relationship whose child qgroup * only has exclusive extents. * - * In this case, all exclsuive extents will also be exlusive for parent, so + * In this case, all exclusive extents will also be exclusive for parent, so * excl/rfer just get added/removed. * * So is qgroup reservation space, which should also be added/removed to @@ -1749,14 +1755,14 @@ static int adjust_slots_upwards(struct btrfs_path *path, int root_level) * * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty * NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty. - * They should be marked during preivous (@dst_level = 1) iteration. + * They should be marked during previous (@dst_level = 1) iteration. * * 3) Mark file extents in leaves dirty * We don't have good way to pick out new file extents only. * So we still follow the old method by scanning all file extents in * the leave. * - * This function can free us from keeping two pathes, thus later we only need + * This function can free us from keeping two paths, thus later we only need * to care about how to iterate all new tree blocks in reloc tree. */ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, @@ -1895,7 +1901,7 @@ out: * * We will iterate through tree blocks NN(b), NN(d) and info qgroup to trace * above tree blocks along with their counter parts in file tree. - * While during search, old tree blocsk OO(c) will be skiped as tree block swap + * While during search, old tree blocks OO(c) will be skipped as tree block swap * won't affect OO(c). */ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, @@ -2020,7 +2026,7 @@ out: * Will go down the tree block pointed by @dst_eb (pointed by @dst_parent and * @dst_slot), and find any tree blocks whose generation is at @last_snapshot, * and then go down @src_eb (pointed by @src_parent and @src_slot) to find - * the conterpart of the tree block, then mark both tree blocks as qgroup dirty, + * the counterpart of the tree block, then mark both tree blocks as qgroup dirty, * and skip all tree blocks whose generation is smaller than last_snapshot. * * This would skip tons of tree blocks of original btrfs_qgroup_trace_subtree(), @@ -2659,7 +2665,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, int i; u64 *i_qgroups; struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *quota_root = fs_info->quota_root; + struct btrfs_root *quota_root; struct btrfs_qgroup *srcgroup; struct btrfs_qgroup *dstgroup; u32 level_size = 0; @@ -2669,6 +2675,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) goto out; + quota_root = fs_info->quota_root; if (!quota_root) { ret = -EINVAL; goto out; @@ -3103,9 +3110,6 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, mutex_unlock(&fs_info->qgroup_rescan_lock); goto out; } - extent_buffer_get(scratch_leaf); - btrfs_tree_read_lock(scratch_leaf); - btrfs_set_lock_blocking_rw(scratch_leaf, BTRFS_READ_LOCK); slot = path->slots[0]; btrfs_release_path(path); mutex_unlock(&fs_info->qgroup_rescan_lock); @@ -3131,10 +3135,8 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, goto out; } out: - if (scratch_leaf) { - btrfs_tree_read_unlock_blocking(scratch_leaf); + if (scratch_leaf) free_extent_buffer(scratch_leaf); - } if (done && !ret) { ret = 1; diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index d8f78f5ab854..20c6bd5fa701 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -70,7 +70,7 @@ struct btrfs_qgroup_extent_record { * be converted into META_PERTRANS. */ enum btrfs_qgroup_rsv_type { - BTRFS_QGROUP_RSV_DATA = 0, + BTRFS_QGROUP_RSV_DATA, BTRFS_QGROUP_RSV_META_PERTRANS, BTRFS_QGROUP_RSV_META_PREALLOC, BTRFS_QGROUP_RSV_LAST, @@ -81,10 +81,10 @@ enum btrfs_qgroup_rsv_type { * * Each type should have different reservation behavior. * E.g, data follows its io_tree flag modification, while - * *currently* meta is just reserve-and-clear during transcation. + * *currently* meta is just reserve-and-clear during transaction. * * TODO: Add new type for reservation which can survive transaction commit. - * Currect metadata reservation behavior is not suitable for such case. + * Current metadata reservation behavior is not suitable for such case. */ struct btrfs_qgroup_rsv { u64 values[BTRFS_QGROUP_RSV_LAST]; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index df41d7049936..e74455eb42f9 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1980,7 +1980,7 @@ cleanup_io: * - In case of single failure, where rbio->failb == -1: * * Cache this rbio iff the above read reconstruction is - * excuted without problems. + * executed without problems. */ if (err == BLK_STS_OK && rbio->failb < 0) cache_rbio_pages(rbio); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index dec14b739b10..10d9589001a9 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -376,26 +376,28 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, goto error; } + /* Insert extent in reada tree + all per-device trees, all or nothing */ + down_read(&fs_info->dev_replace.rwsem); ret = radix_tree_preload(GFP_KERNEL); - if (ret) + if (ret) { + up_read(&fs_info->dev_replace.rwsem); goto error; + } - /* insert extent in reada_tree + all per-device trees, all or nothing */ - btrfs_dev_replace_read_lock(&fs_info->dev_replace); spin_lock(&fs_info->reada_lock); ret = radix_tree_insert(&fs_info->reada_tree, index, re); if (ret == -EEXIST) { re_exist = radix_tree_lookup(&fs_info->reada_tree, index); re_exist->refcnt++; spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); radix_tree_preload_end(); + up_read(&fs_info->dev_replace.rwsem); goto error; } if (ret) { spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); radix_tree_preload_end(); + up_read(&fs_info->dev_replace.rwsem); goto error; } radix_tree_preload_end(); @@ -437,13 +439,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, } radix_tree_delete(&fs_info->reada_tree, index); spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + up_read(&fs_info->dev_replace.rwsem); goto error; } have_zone = 1; } spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + up_read(&fs_info->dev_replace.rwsem); if (!have_zone) goto error; diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index d69fbfb30aa9..c3557c12656b 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -43,7 +43,7 @@ struct ref_entry { * back to the delayed ref action. We hold the ref we are changing in the * action so we can account for the history properly, and we record the root we * were called with since it could be different from ref_root. We also store - * stack traces because thats how I roll. + * stack traces because that's how I roll. */ struct ref_action { int action; @@ -56,7 +56,7 @@ struct ref_action { /* * One of these for every block we reference, it holds the roots and references - * to it as well as all of the ref actions that have occured to it. We never + * to it as well as all of the ref actions that have occurred to it. We never * free it until we unmount the file system in order to make sure re-allocations * are happening properly. */ @@ -859,7 +859,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, * This shouldn't happen because we will add our re * above when we lookup the be with !parent, but just in * case catch this case so we don't panic because I - * didn't thik of some other corner case. + * didn't think of some other corner case. */ btrfs_err(fs_info, "failed to find root %llu for %llu", root->root_key.objectid, be->bytenr); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 924116f654a1..272b287f8cf0 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2631,7 +2631,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, * only one thread can access block_rsv at this point, * so we don't need hold lock to protect block_rsv. * we expand more reservation size here to allow enough - * space for relocation and we will return eailer in + * space for relocation and we will return earlier in * enospc case. */ rc->block_rsv->size = tmp + fs_info->nodesize * @@ -3959,6 +3959,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) restart: if (update_backref_cache(trans, &rc->backref_cache)) { btrfs_end_transaction(trans); + trans = NULL; continue; } @@ -4184,37 +4185,13 @@ static struct reloc_control *alloc_reloc_control(void) static void describe_relocation(struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group) { - char buf[128]; /* prefixed by a '|' that'll be dropped */ - u64 flags = block_group->flags; + char buf[128] = {'\0'}; - /* Shouldn't happen */ - if (!flags) { - strcpy(buf, "|NONE"); - } else { - char *bp = buf; - -#define DESCRIBE_FLAG(f, d) \ - if (flags & BTRFS_BLOCK_GROUP_##f) { \ - bp += snprintf(bp, buf - bp + sizeof(buf), "|%s", d); \ - flags &= ~BTRFS_BLOCK_GROUP_##f; \ - } - DESCRIBE_FLAG(DATA, "data"); - DESCRIBE_FLAG(SYSTEM, "system"); - DESCRIBE_FLAG(METADATA, "metadata"); - DESCRIBE_FLAG(RAID0, "raid0"); - DESCRIBE_FLAG(RAID1, "raid1"); - DESCRIBE_FLAG(DUP, "dup"); - DESCRIBE_FLAG(RAID10, "raid10"); - DESCRIBE_FLAG(RAID5, "raid5"); - DESCRIBE_FLAG(RAID6, "raid6"); - if (flags) - snprintf(bp, buf - bp + sizeof(buf), "|0x%llx", flags); -#undef DESCRIBE_FLAG - } + btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); btrfs_info(fs_info, "relocating block group %llu flags %s", - block_group->key.objectid, buf + 1); + block_group->key.objectid, buf); } /* @@ -4222,6 +4199,7 @@ static void describe_relocation(struct btrfs_fs_info *fs_info, */ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) { + struct btrfs_block_group_cache *bg; struct btrfs_root *extent_root = fs_info->extent_root; struct reloc_control *rc; struct inode *inode; @@ -4230,14 +4208,23 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) int rw = 0; int err = 0; + bg = btrfs_lookup_block_group(fs_info, group_start); + if (!bg) + return -ENOENT; + + if (btrfs_pinned_by_swapfile(fs_info, bg)) { + btrfs_put_block_group(bg); + return -ETXTBSY; + } + rc = alloc_reloc_control(); - if (!rc) + if (!rc) { + btrfs_put_block_group(bg); return -ENOMEM; + } rc->extent_root = extent_root; - - rc->block_group = btrfs_lookup_block_group(fs_info, group_start); - BUG_ON(!rc->block_group); + rc->block_group = bg; ret = btrfs_inc_block_group_ro(rc->block_group); if (ret) { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 902819d3cf41..6dcd36d7b849 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -339,7 +339,9 @@ static struct full_stripe_lock *insert_full_stripe_lock( } } - /* Insert new lock */ + /* + * Insert new lock. + */ ret = kmalloc(sizeof(*ret), GFP_KERNEL); if (!ret) return ERR_PTR(-ENOMEM); @@ -568,12 +570,11 @@ static void scrub_put_ctx(struct scrub_ctx *sctx) scrub_free_ctx(sctx); } -static noinline_for_stack -struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) +static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( + struct btrfs_fs_info *fs_info, int is_dev_replace) { struct scrub_ctx *sctx; int i; - struct btrfs_fs_info *fs_info = dev->fs_info; sctx = kzalloc(sizeof(*sctx), GFP_KERNEL); if (!sctx) @@ -582,7 +583,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sctx->is_dev_replace = is_dev_replace; sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; sctx->curr = -1; - sctx->fs_info = dev->fs_info; + sctx->fs_info = fs_info; for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { struct scrub_bio *sbio; @@ -832,6 +833,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) int page_num; int success; bool full_stripe_locked; + unsigned int nofs_flag; static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -857,6 +859,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) dev = sblock_to_check->pagev[0]->dev; /* + * We must use GFP_NOFS because the scrub task might be waiting for a + * worker task executing this function and in turn a transaction commit + * might be waiting the scrub task to pause (which needs to wait for all + * the worker tasks to complete before pausing). + * We do allocations in the workers through insert_full_stripe_lock() + * and scrub_add_page_to_wr_bio(), which happens down the call chain of + * this function. + */ + nofs_flag = memalloc_nofs_save(); + /* * For RAID5/6, race can happen for a different device scrub thread. * For data corruption, Parity and Data threads will both try * to recovery the data. @@ -865,6 +877,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) */ ret = lock_full_stripe(fs_info, logical, &full_stripe_locked); if (ret < 0) { + memalloc_nofs_restore(nofs_flag); spin_lock(&sctx->stat_lock); if (ret == -ENOMEM) sctx->stat.malloc_errors++; @@ -904,7 +917,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) */ sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS, - sizeof(*sblocks_for_recheck), GFP_NOFS); + sizeof(*sblocks_for_recheck), GFP_KERNEL); if (!sblocks_for_recheck) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -1202,6 +1215,7 @@ out: } ret = unlock_full_stripe(fs_info, logical, full_stripe_locked); + memalloc_nofs_restore(nofs_flag); if (ret < 0) return ret; return 0; @@ -3540,7 +3554,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (!ret && sctx->is_dev_replace) { /* * If we are doing a device replace wait for any tasks - * that started dellaloc right before we set the block + * that started delalloc right before we set the block * group to RO mode, as they might have just allocated * an extent from it or decided they could do a nocow * write. And if any such tasks did that, wait for their @@ -3596,11 +3610,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, break; } - btrfs_dev_replace_write_lock(&fs_info->dev_replace); + down_write(&fs_info->dev_replace.rwsem); dev_replace->cursor_right = found_key.offset + length; dev_replace->cursor_left = found_key.offset; dev_replace->item_needs_writeback = 1; - btrfs_dev_replace_write_unlock(&fs_info->dev_replace); + up_write(&dev_replace->rwsem); + ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length, found_key.offset, cache); @@ -3636,10 +3651,10 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, scrub_pause_off(fs_info); - btrfs_dev_replace_write_lock(&fs_info->dev_replace); + down_write(&fs_info->dev_replace.rwsem); dev_replace->cursor_left = dev_replace->cursor_right; dev_replace->item_needs_writeback = 1; - btrfs_dev_replace_write_unlock(&fs_info->dev_replace); + up_write(&fs_info->dev_replace.rwsem); if (ro_set) btrfs_dec_block_group_ro(cache); @@ -3772,6 +3787,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, struct scrub_ctx *sctx; int ret; struct btrfs_device *dev; + unsigned int nofs_flag; if (btrfs_fs_closing(fs_info)) return -EINVAL; @@ -3813,13 +3829,18 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return -EINVAL; } + /* Allocate outside of device_list_mutex */ + sctx = scrub_setup_ctx(fs_info, is_dev_replace); + if (IS_ERR(sctx)) + return PTR_ERR(sctx); mutex_lock(&fs_info->fs_devices->device_list_mutex); dev = btrfs_find_device(fs_info, devid, NULL, NULL); if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) && !is_dev_replace)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); - return -ENODEV; + ret = -ENODEV; + goto out_free_ctx; } if (!is_dev_replace && !readonly && @@ -3827,7 +3848,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_unlock(&fs_info->fs_devices->device_list_mutex); btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable", rcu_str_deref(dev->name)); - return -EROFS; + ret = -EROFS; + goto out_free_ctx; } mutex_lock(&fs_info->scrub_lock); @@ -3835,34 +3857,29 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - return -EIO; + ret = -EIO; + goto out_free_ctx; } - btrfs_dev_replace_read_lock(&fs_info->dev_replace); + down_read(&fs_info->dev_replace.rwsem); if (dev->scrub_ctx || (!is_dev_replace && btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + up_read(&fs_info->dev_replace.rwsem); mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - return -EINPROGRESS; + ret = -EINPROGRESS; + goto out_free_ctx; } - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + up_read(&fs_info->dev_replace.rwsem); ret = scrub_workers_get(fs_info, is_dev_replace); if (ret) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - return ret; + goto out_free_ctx; } - sctx = scrub_setup_ctx(dev, is_dev_replace); - if (IS_ERR(sctx)) { - mutex_unlock(&fs_info->scrub_lock); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); - scrub_workers_put(fs_info); - return PTR_ERR(sctx); - } sctx->readonly = readonly; dev->scrub_ctx = sctx; mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -3875,6 +3892,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, atomic_inc(&fs_info->scrubs_running); mutex_unlock(&fs_info->scrub_lock); + /* + * In order to avoid deadlock with reclaim when there is a transaction + * trying to pause scrub, make sure we use GFP_NOFS for all the + * allocations done at btrfs_scrub_pages() and scrub_pages_for_parity() + * invoked by our callees. The pausing request is done when the + * transaction commit starts, and it blocks the transaction until scrub + * is paused (done at specific points at scrub_stripe() or right above + * before incrementing fs_info->scrubs_running). + */ + nofs_flag = memalloc_nofs_save(); if (!is_dev_replace) { /* * by holding device list mutex, we can @@ -3887,6 +3914,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (!ret) ret = scrub_enumerate_chunks(sctx, dev, start, end); + memalloc_nofs_restore(nofs_flag); wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); atomic_dec(&fs_info->scrubs_running); @@ -3905,6 +3933,11 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, scrub_put_ctx(sctx); return ret; + +out_free_ctx: + scrub_free_ctx(sctx); + + return ret; } void btrfs_scrub_pause(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 094cc1444a90..1b15b43905f8 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2238,7 +2238,7 @@ out: * inodes "orphan" name instead of the real name and stop. Same with new inodes * that were not created yet and overwritten inodes/refs. * - * When do we have have orphan inodes: + * When do we have orphan inodes: * 1. When an inode is freshly created and thus no valid refs are available yet * 2. When a directory lost all it's refs (deleted) but still has dir items * inside which were not processed yet (pending for move/delete). If anyone @@ -3340,7 +3340,8 @@ static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m) kfree(m); } -static void tail_append_pending_moves(struct pending_dir_move *moves, +static void tail_append_pending_moves(struct send_ctx *sctx, + struct pending_dir_move *moves, struct list_head *stack) { if (list_empty(&moves->list)) { @@ -3351,6 +3352,10 @@ static void tail_append_pending_moves(struct pending_dir_move *moves, list_add_tail(&moves->list, stack); list_splice_tail(&list, stack); } + if (!RB_EMPTY_NODE(&moves->node)) { + rb_erase(&moves->node, &sctx->pending_dir_moves); + RB_CLEAR_NODE(&moves->node); + } } static int apply_children_dir_moves(struct send_ctx *sctx) @@ -3365,7 +3370,7 @@ static int apply_children_dir_moves(struct send_ctx *sctx) return 0; INIT_LIST_HEAD(&stack); - tail_append_pending_moves(pm, &stack); + tail_append_pending_moves(sctx, pm, &stack); while (!list_empty(&stack)) { pm = list_first_entry(&stack, struct pending_dir_move, list); @@ -3376,7 +3381,7 @@ static int apply_children_dir_moves(struct send_ctx *sctx) goto out; pm = get_pending_dir_moves(sctx, parent_ino); if (pm) - tail_append_pending_moves(pm, &stack); + tail_append_pending_moves(sctx, pm, &stack); } return 0; @@ -3849,7 +3854,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) /* * We may have refs where the parent directory does not exist * yet. This happens if the parent directories inum is higher - * the the current inum. To handle this case, we create the + * than the current inum. To handle this case, we create the * parent directory out of order. But we need to check if this * did already happen before due to other refs in the same dir. */ @@ -4770,7 +4775,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) struct btrfs_key key; pgoff_t index = offset >> PAGE_SHIFT; pgoff_t last_index; - unsigned pg_offset = offset & ~PAGE_MASK; + unsigned pg_offset = offset_in_page(offset); ssize_t ret = 0; key.objectid = sctx->cur_ino; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b362b45dd757..368a5b9e6c13 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -93,7 +93,7 @@ const char *btrfs_decode_error(int errno) /* * __btrfs_handle_fs_error decodes expected errors from the caller and - * invokes the approciate error response. + * invokes the appropriate error response. */ __cold void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, @@ -151,7 +151,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function * although there is no way to update the progress. It would add the * risk of a deadlock, therefore the canceling is omitted. The only * penalty is that some I/O remains active until the procedure - * completes. The next time when the filesystem is mounted writeable + * completes. The next time when the filesystem is mounted writable * again, the device replace operation continues. */ } @@ -1848,7 +1848,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (!btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, - "too many missing devices, writeable remount is not allowed"); + "too many missing devices, writable remount is not allowed"); ret = -EACCES; goto restore; } @@ -1916,7 +1916,7 @@ restore: } /* Used to sort the devices by max_avail(descending sort) */ -static int btrfs_cmp_device_free_bytes(const void *dev_info1, +static inline int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2) { if (((struct btrfs_device_info *)dev_info1)->max_avail > @@ -1945,8 +1945,8 @@ static inline void btrfs_descending_sort_devices( * The helper to calc the free space on the devices that can be used to store * file data. */ -static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, - u64 *free_bytes) +static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, + u64 *free_bytes) { struct btrfs_device_info *devices_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; @@ -2090,7 +2090,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 total_free_data = 0; u64 total_free_meta = 0; int bits = dentry->d_sb->s_blocksize_bits; - __be32 *fsid = (__be32 *)fs_info->fsid; + __be32 *fsid = (__be32 *)fs_info->fs_devices->fsid; unsigned factor = 1; struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; int ret; @@ -2237,6 +2237,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, vol = memdup_user((void __user *)arg, sizeof(*vol)); if (IS_ERR(vol)) return PTR_ERR(vol); + vol->name[BTRFS_PATH_NAME_MAX] = '\0'; switch (cmd) { case BTRFS_IOC_SCAN_DEV: @@ -2311,7 +2312,7 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) * device_list_mutex here as we only read the device data and the list * is protected by RCU. Even if a device is deleted during the list * traversals, we'll get valid data, the freeing callback will wait at - * least until until the rcu_read_unlock. + * least until the rcu_read_unlock. */ rcu_read_lock(); cur_devices = fs_info->fs_devices; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 3717c864ba23..5a5930e3d32b 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -191,6 +191,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF); BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56); BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA); BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); +BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID); BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); static struct attribute *btrfs_supported_feature_attrs[] = { @@ -204,6 +205,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(raid56), BTRFS_FEAT_ATTR_PTR(skinny_metadata), BTRFS_FEAT_ATTR_PTR(no_holes), + BTRFS_FEAT_ATTR_PTR(metadata_uuid), BTRFS_FEAT_ATTR_PTR(free_space_tree), NULL }; @@ -505,12 +507,24 @@ static ssize_t quota_override_store(struct kobject *kobj, BTRFS_ATTR_RW(, quota_override, quota_override_show, quota_override_store); +static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%pU\n", + fs_info->fs_devices->metadata_uuid); +} + +BTRFS_ATTR(, metadata_uuid, btrfs_metadata_uuid_show); + static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, label), BTRFS_ATTR_PTR(, nodesize), BTRFS_ATTR_PTR(, sectorsize), BTRFS_ATTR_PTR(, clone_alignment), BTRFS_ATTR_PTR(, quota_override), + BTRFS_ATTR_PTR(, metadata_uuid), NULL, }; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index c6ee600aff89..40716b357c1d 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -9,7 +9,7 @@ extern u64 btrfs_debugfs_test; enum btrfs_feature_set { - FEAT_COMPAT = 0, + FEAT_COMPAT, FEAT_COMPAT_RO, FEAT_INCOMPAT, FEAT_MAX diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index db72b3b6209e..8a59597f1883 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -174,8 +174,10 @@ void btrfs_free_dummy_root(struct btrfs_root *root) /* Will be freed by btrfs_free_fs_roots */ if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state))) return; - if (root->node) + if (root->node) { + /* One for allocate_extent_buffer */ free_extent_buffer(root->node); + } kfree(root); } diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 9e0f4a01be14..3c46d7f23456 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -62,10 +62,11 @@ static int test_find_delalloc(u32 sectorsize) struct page *page; struct page *locked_page = NULL; unsigned long index = 0; - u64 total_dirty = SZ_256M; - u64 max_bytes = SZ_128M; + /* In this test we need at least 2 file extents at its maximum size */ + u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; + u64 total_dirty = 2 * max_bytes; u64 start, end, test_start; - u64 found; + bool found; int ret = -EINVAL; test_msg("running find delalloc tests"); @@ -76,7 +77,7 @@ static int test_find_delalloc(u32 sectorsize) return -ENOMEM; } - extent_io_tree_init(&tmp, inode); + extent_io_tree_init(&tmp, NULL); /* * First go through and create and mark all of our pages dirty, we pin @@ -106,8 +107,8 @@ static int test_find_delalloc(u32 sectorsize) set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL); start = 0; end = 0; - found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, - &end, max_bytes); + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end); if (!found) { test_err("should have found at least one delalloc"); goto out_bits; @@ -137,8 +138,8 @@ static int test_find_delalloc(u32 sectorsize) set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL); start = test_start; end = 0; - found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, - &end, max_bytes); + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end); if (!found) { test_err("couldn't find delalloc in our range"); goto out_bits; @@ -171,8 +172,8 @@ static int test_find_delalloc(u32 sectorsize) } start = test_start; end = 0; - found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, - &end, max_bytes); + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end); if (found) { test_err("found range when we shouldn't have"); goto out_bits; @@ -192,8 +193,8 @@ static int test_find_delalloc(u32 sectorsize) set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL); start = test_start; end = 0; - found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, - &end, max_bytes); + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end); if (!found) { test_err("didn't find our range"); goto out_bits; @@ -233,8 +234,8 @@ static int test_find_delalloc(u32 sectorsize) * this changes at any point in the future we will need to fix this * tests expected behavior. */ - found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, - &end, max_bytes); + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end); if (!found) { test_err("didn't find our range"); goto out_bits; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 64043f028820..af0c8e30d9e2 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -254,11 +254,6 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } - /* - * We will just free a dummy node if it's ref count is 2 so we need an - * extra ref so our searches don't accidentally release our page. - */ - extent_buffer_get(root->node); btrfs_set_header_nritems(root->node, 0); btrfs_set_header_level(root->node, 0); ret = -EINVAL; @@ -860,7 +855,6 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) goto out; } - extent_buffer_get(root->node); btrfs_set_header_nritems(root->node, 0); btrfs_set_header_level(root->node, 0); BTRFS_I(inode)->root = root; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d1eeef9ec5da..127fa1535f58 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -233,14 +233,12 @@ loop: extwriter_counter_init(cur_trans, type); init_waitqueue_head(&cur_trans->writer_wait); init_waitqueue_head(&cur_trans->commit_wait); - init_waitqueue_head(&cur_trans->pending_wait); cur_trans->state = TRANS_STATE_RUNNING; /* * One for this trans handle, one so it will live on until we * commit the transaction. */ refcount_set(&cur_trans->use_count, 2); - atomic_set(&cur_trans->pending_ordered, 0); cur_trans->flags = 0; cur_trans->start_time = ktime_get_seconds(); @@ -456,7 +454,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, bool enforce_qgroups) { struct btrfs_fs_info *fs_info = root->fs_info; - + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; u64 num_bytes = 0; @@ -485,13 +483,28 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, * the appropriate flushing if need be. */ if (num_items && root != fs_info->chunk_root) { + struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv; + u64 delayed_refs_bytes = 0; + qgroup_reserved = num_items * fs_info->nodesize; ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved, enforce_qgroups); if (ret) return ERR_PTR(ret); + /* + * We want to reserve all the bytes we may need all at once, so + * we only do 1 enospc flushing cycle per transaction start. We + * accomplish this by simply assuming we'll do 2 x num_items + * worth of delayed refs updates in this trans handle, and + * refill that amount for whatever is missing in the reserve. + */ num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items); + if (delayed_refs_rsv->full == 0) { + delayed_refs_bytes = num_bytes; + num_bytes <<= 1; + } + /* * Do the reservation for the relocation root creation */ @@ -500,8 +513,24 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, reloc_reserved = true; } - ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv, - num_bytes, flush); + ret = btrfs_block_rsv_add(root, rsv, num_bytes, flush); + if (ret) + goto reserve_fail; + if (delayed_refs_bytes) { + btrfs_migrate_to_delayed_refs_rsv(fs_info, rsv, + delayed_refs_bytes); + num_bytes -= delayed_refs_bytes; + } + } else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL && + !delayed_refs_rsv->full) { + /* + * Some people call with btrfs_start_transaction(root, 0) + * because they can be throttled, but have some other mechanism + * for reserving space. We still want these guys to refill the + * delayed block_rsv so just add 1 items worth of reservation + * here. + */ + ret = btrfs_delayed_refs_rsv_refill(fs_info, flush); if (ret) goto reserve_fail; } @@ -670,7 +699,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) /* * btrfs_attach_transaction_barrier() - catch the running transaction * - * It is similar to the above function, the differentia is this one + * It is similar to the above function, the difference is this one * will wait for all the inactive transactions until they fully * complete. */ @@ -760,7 +789,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - if (btrfs_check_space_for_delayed_refs(trans)) + if (btrfs_check_space_for_delayed_refs(fs_info)) return 1; return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5); @@ -769,22 +798,12 @@ static int should_end_transaction(struct btrfs_trans_handle *trans) int btrfs_should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_transaction *cur_trans = trans->transaction; - int updates; - int err; smp_mb(); if (cur_trans->state >= TRANS_STATE_BLOCKED || cur_trans->delayed_refs.flushing) return 1; - updates = trans->delayed_ref_updates; - trans->delayed_ref_updates = 0; - if (updates) { - err = btrfs_run_delayed_refs(trans, updates * 2); - if (err) /* Error code will also eval true */ - return err; - } - return should_end_transaction(trans); } @@ -814,11 +833,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; - u64 transid = trans->transid; - unsigned long cur = trans->delayed_ref_updates; int lock = (trans->type != TRANS_JOIN_NOLOCK); int err = 0; - int must_run_delayed_refs = 0; if (refcount_read(&trans->use_count) > 1) { refcount_dec(&trans->use_count); @@ -832,27 +848,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans); - trans->delayed_ref_updates = 0; - if (!trans->sync) { - must_run_delayed_refs = - btrfs_should_throttle_delayed_refs(trans); - cur = max_t(unsigned long, cur, 32); - - /* - * don't make the caller wait if they are from a NOLOCK - * or ATTACH transaction, it will deadlock with commit - */ - if (must_run_delayed_refs == 1 && - (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH))) - must_run_delayed_refs = 2; - } - - btrfs_trans_release_metadata(trans); - trans->block_rsv = NULL; - - if (!list_empty(&trans->new_bgs)) - btrfs_create_pending_block_groups(trans); - btrfs_trans_release_chunk_metadata(trans); if (lock && should_end_transaction(trans) && @@ -894,10 +889,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } kmem_cache_free(btrfs_trans_handle_cachep, trans); - if (must_run_delayed_refs) { - btrfs_async_run_delayed_refs(info, cur, transid, - must_run_delayed_refs == 1); - } return err; } @@ -1338,7 +1329,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, return 0; /* - * Ensure dirty @src will be commited. Or, after comming + * Ensure dirty @src will be committed. Or, after coming * commit_fs_roots() and switch_commit_roots(), any dirty but not * recorded root will never be updated again, causing an outdated root * item. @@ -1842,7 +1833,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; - DEFINE_WAIT(wait); WARN_ON(refcount_read(&trans->use_count) > 1); @@ -1911,13 +1901,6 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } -static inline void -btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans) -{ - wait_event(cur_trans->pending_wait, - atomic_read(&cur_trans->pending_ordered) == 0); -} - int btrfs_commit_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -2052,8 +2035,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_wait_delalloc_flush(fs_info); - btrfs_wait_pending_ordered(cur_trans); - btrfs_scrub_pause(fs_info); /* * Ok now we need to make sure to block out any other joins while we diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 4cbb1b55387d..f1ba78949d1b 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -12,13 +12,13 @@ #include "ctree.h" enum btrfs_trans_state { - TRANS_STATE_RUNNING = 0, - TRANS_STATE_BLOCKED = 1, - TRANS_STATE_COMMIT_START = 2, - TRANS_STATE_COMMIT_DOING = 3, - TRANS_STATE_UNBLOCKED = 4, - TRANS_STATE_COMPLETED = 5, - TRANS_STATE_MAX = 6, + TRANS_STATE_RUNNING, + TRANS_STATE_BLOCKED, + TRANS_STATE_COMMIT_START, + TRANS_STATE_COMMIT_DOING, + TRANS_STATE_UNBLOCKED, + TRANS_STATE_COMPLETED, + TRANS_STATE_MAX, }; #define BTRFS_TRANS_HAVE_FREE_BGS 0 @@ -39,7 +39,6 @@ struct btrfs_transaction { */ atomic_t num_writers; refcount_t use_count; - atomic_t pending_ordered; unsigned long flags; @@ -51,7 +50,6 @@ struct btrfs_transaction { time64_t start_time; wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; - wait_queue_head_t pending_wait; struct list_head pending_snapshots; struct list_head pending_chunks; struct list_head switch_commits; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index cab0b1f1f741..a62e1e837a89 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -27,10 +27,10 @@ * * @type: leaf or node * @identifier: the necessary info to locate the leaf/node. - * It's recommened to decode key.objecitd/offset if it's + * It's recommended to decode key.objecitd/offset if it's * meaningful. * @reason: describe the error - * @bad_value: optional, it's recommened to output bad value and its + * @bad_value: optional, it's recommended to output bad value and its * expected value (range). * * Since comma is used to separate the components, only space is allowed @@ -130,7 +130,7 @@ static int check_extent_data_item(struct btrfs_fs_info *fs_info, } /* - * Support for new compression/encrption must introduce incompat flag, + * Support for new compression/encryption must introduce incompat flag, * and must be caught in open_ctree(). */ if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) { @@ -389,13 +389,11 @@ static int check_block_group_item(struct btrfs_fs_info *fs_info, /* * Here we don't really care about alignment since extent allocator can - * handle it. We care more about the size, as if one block group is - * larger than maximum size, it's must be some obvious corruption. + * handle it. We care more about the size. */ - if (key->offset > BTRFS_MAX_DATA_CHUNK_SIZE || key->offset == 0) { + if (key->offset == 0) { block_group_err(fs_info, leaf, slot, - "invalid block group size, have %llu expect (0, %llu]", - key->offset, BTRFS_MAX_DATA_CHUNK_SIZE); + "invalid block group size 0"); return -EUCLEAN; } @@ -440,7 +438,7 @@ static int check_block_group_item(struct btrfs_fs_info *fs_info, type != (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA)) { block_group_err(fs_info, leaf, slot, -"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llu or 0x%llx", +"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx", type, hweight64(type), BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA, BTRFS_BLOCK_GROUP_SYSTEM, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e07f3376b7df..ac232b3d6d7e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1144,7 +1144,7 @@ next: } btrfs_release_path(path); - /* look for a conflicing name */ + /* look for a conflicting name */ di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, namelen, 0); if (di && !IS_ERR(di)) { @@ -3149,7 +3149,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&log_root_tree->log_mutex); /* - * nobody else is going to jump in and write the the ctree + * Nobody else is going to jump in and write the ctree * super here because the log_commit atomic below is protecting * us. We must be called with a transaction handle pinning * the running transaction open, so a full commit can't hop @@ -3201,8 +3201,6 @@ static void free_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *log) { int ret; - u64 start; - u64 end; struct walk_control wc = { .free = 1, .process_func = process_one_buffer @@ -3216,18 +3214,8 @@ static void free_log_tree(struct btrfs_trans_handle *trans, btrfs_handle_fs_error(log->fs_info, ret, NULL); } - while (1) { - ret = find_first_extent_bit(&log->dirty_log_pages, - 0, &start, &end, - EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT, - NULL); - if (ret) - break; - - clear_extent_bits(&log->dirty_log_pages, start, end, - EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); - } - + clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, + EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); free_extent_buffer(log->node); kfree(log); } @@ -4383,7 +4371,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct extent_map *em, *n; struct list_head extents; struct extent_map_tree *tree = &inode->extent_tree; - u64 logged_start, logged_end; u64 test_gen; int ret = 0; int num = 0; @@ -4392,10 +4379,25 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, write_lock(&tree->lock); test_gen = root->fs_info->last_trans_committed; - logged_start = start; - logged_end = end; list_for_each_entry_safe(em, n, &tree->modified_extents, list) { + /* + * Skip extents outside our logging range. It's important to do + * it for correctness because if we don't ignore them, we may + * log them before their ordered extent completes, and therefore + * we could log them without logging their respective checksums + * (the checksum items are added to the csum tree at the very + * end of btrfs_finish_ordered_io()). Also leave such extents + * outside of our range in the list, since we may have another + * ranged fsync in the near future that needs them. If an extent + * outside our range corresponds to a hole, log it to avoid + * leaving gaps between extents (fsck will complain when we are + * not using the NO_HOLES feature). + */ + if ((em->start > end || em->start + em->len <= start) && + em->block_start != EXTENT_MAP_HOLE) + continue; + list_del_init(&em->list); /* * Just an arbitrary number, this can be really CPU intensive @@ -4417,11 +4419,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, em->start >= i_size_read(&inode->vfs_inode)) continue; - if (em->start < logged_start) - logged_start = em->start; - if ((em->start + em->len - 1) > logged_end) - logged_end = em->start + em->len - 1; - /* Need a ref to keep it from getting evicted from cache */ refcount_inc(&em->refs); set_bit(EXTENT_FLAG_LOGGING, &em->flags); @@ -5761,6 +5758,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_trans; } + /* + * If a new hard link was added to the inode in the current transaction + * and its link count is now greater than 1, we need to fallback to a + * transaction commit, otherwise we can end up not logging all its new + * parents for all the hard links. Here just from the dentry used to + * fsync, we can not visit the ancestor inodes for all the other hard + * links to figure out if any is new, so we fallback to a transaction + * commit (instead of adding a lot of complexity of scanning a btree, + * since this scenario is not a common use case). + */ + if (inode->vfs_inode.i_nlink > 1 && + inode->last_link_trans > last_committed) { + ret = -EMLINK; + goto end_trans; + } + while (1) { if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) break; diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 767765031e59..0fab84a8f670 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -15,7 +15,6 @@ struct btrfs_log_ctx { int log_ret; int log_transid; - int io_err; bool log_new_dentries; struct inode *inode; struct list_head list; @@ -26,7 +25,6 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, { ctx->log_ret = 0; ctx->log_transid = 0; - ctx->io_err = 0; ctx->log_new_dentries = false; ctx->inode = inode; INIT_LIST_HEAD(&ctx->list); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f435d397019e..2576b1a379c9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -37,6 +37,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 1, .devs_increment = 2, .ncopies = 2, + .nparity = 0, .raid_name = "raid10", .bg_flag = BTRFS_BLOCK_GROUP_RAID10, .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, @@ -49,6 +50,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 1, .devs_increment = 2, .ncopies = 2, + .nparity = 0, .raid_name = "raid1", .bg_flag = BTRFS_BLOCK_GROUP_RAID1, .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, @@ -61,6 +63,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 0, .devs_increment = 1, .ncopies = 2, + .nparity = 0, .raid_name = "dup", .bg_flag = BTRFS_BLOCK_GROUP_DUP, .mindev_error = 0, @@ -73,6 +76,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 0, .devs_increment = 1, .ncopies = 1, + .nparity = 0, .raid_name = "raid0", .bg_flag = BTRFS_BLOCK_GROUP_RAID0, .mindev_error = 0, @@ -85,6 +89,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 0, .devs_increment = 1, .ncopies = 1, + .nparity = 0, .raid_name = "single", .bg_flag = 0, .mindev_error = 0, @@ -96,7 +101,8 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .devs_min = 2, .tolerated_failures = 1, .devs_increment = 1, - .ncopies = 2, + .ncopies = 1, + .nparity = 1, .raid_name = "raid5", .bg_flag = BTRFS_BLOCK_GROUP_RAID5, .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, @@ -108,7 +114,8 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .devs_min = 3, .tolerated_failures = 2, .devs_increment = 1, - .ncopies = 3, + .ncopies = 1, + .nparity = 2, .raid_name = "raid6", .bg_flag = BTRFS_BLOCK_GROUP_RAID6, .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, @@ -123,6 +130,60 @@ const char *get_raid_name(enum btrfs_raid_types type) return btrfs_raid_array[type].raid_name; } +/* + * Fill @buf with textual description of @bg_flags, no more than @size_buf + * bytes including terminating null byte. + */ +void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) +{ + int i; + int ret; + char *bp = buf; + u64 flags = bg_flags; + u32 size_bp = size_buf; + + if (!flags) { + strcpy(bp, "NONE"); + return; + } + +#define DESCRIBE_FLAG(flag, desc) \ + do { \ + if (flags & (flag)) { \ + ret = snprintf(bp, size_bp, "%s|", (desc)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + flags &= ~(flag); \ + } \ + } while (0) + + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); + + DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag, + btrfs_raid_array[i].raid_name); +#undef DESCRIBE_FLAG + + if (flags) { + ret = snprintf(bp, size_bp, "0x%llx|", flags); + size_bp -= ret; + } + + if (size_bp < size_buf) + buf[size_buf - size_bp - 1] = '\0'; /* remove last | */ + + /* + * The text is trimmed, it's up to the caller to provide sufficiently + * large buffer + */ +out_overflow:; +} + static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); @@ -151,7 +212,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * the mutex can be very coarse and can cover long-running operations * * protects: updates to fs_devices counters like missing devices, rw devices, - * seeding, structure cloning, openning/closing devices at mount/umount time + * seeding, structure cloning, opening/closing devices at mount/umount time * * global::fs_devs - add, remove, updates to the global list * @@ -238,13 +299,15 @@ struct list_head *btrfs_get_fs_uuids(void) /* * alloc_fs_devices - allocate struct btrfs_fs_devices - * @fsid: if not NULL, copy the uuid to fs_devices::fsid + * @fsid: if not NULL, copy the UUID to fs_devices::fsid + * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid * * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR(). * The returned struct is not linked onto any lists and can be destroyed with * kfree() right away. */ -static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) +static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, + const u8 *metadata_fsid) { struct btrfs_fs_devices *fs_devs; @@ -261,6 +324,11 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) if (fsid) memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); + if (metadata_fsid) + memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE); + else if (fsid) + memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE); + return fs_devs; } @@ -368,13 +436,57 @@ static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices, return NULL; } -static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) +static noinline struct btrfs_fs_devices *find_fsid( + const u8 *fsid, const u8 *metadata_fsid) { struct btrfs_fs_devices *fs_devices; + ASSERT(fsid); + + if (metadata_fsid) { + /* + * Handle scanned device having completed its fsid change but + * belonging to a fs_devices that was created by first scanning + * a device which didn't have its fsid/metadata_uuid changed + * at all and the CHANGING_FSID_V2 flag set. + */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (fs_devices->fsid_change && + memcmp(metadata_fsid, fs_devices->fsid, + BTRFS_FSID_SIZE) == 0 && + memcmp(fs_devices->fsid, fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) == 0) { + return fs_devices; + } + } + /* + * Handle scanned device having completed its fsid change but + * belonging to a fs_devices that was created by a device that + * has an outdated pair of fsid/metadata_uuid and + * CHANGING_FSID_V2 flag set. + */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (fs_devices->fsid_change && + memcmp(fs_devices->metadata_uuid, + fs_devices->fsid, BTRFS_FSID_SIZE) != 0 && + memcmp(metadata_fsid, fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) == 0) { + return fs_devices; + } + } + } + + /* Handle non-split brain cases */ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) - return fs_devices; + if (metadata_fsid) { + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0 + && memcmp(metadata_fsid, fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) == 0) + return fs_devices; + } else { + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) + return fs_devices; + } } return NULL; } @@ -709,6 +821,13 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, device->generation = btrfs_super_generation(disk_super); if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { + if (btrfs_super_incompat_flags(disk_super) & + BTRFS_FEATURE_INCOMPAT_METADATA_UUID) { + pr_err( + "BTRFS: Invalid seeding and uuid-changed device detected\n"); + goto error_brelse; + } + clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); fs_devices->seeding = 1; } else { @@ -744,6 +863,51 @@ error_brelse: } /* + * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices + * being created with a disk that has already completed its fsid change. + */ +static struct btrfs_fs_devices *find_fsid_inprogress( + struct btrfs_super_block *disk_super) +{ + struct btrfs_fs_devices *fs_devices; + + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, + BTRFS_FSID_SIZE) != 0 && + memcmp(fs_devices->metadata_uuid, disk_super->fsid, + BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) { + return fs_devices; + } + } + + return NULL; +} + + +static struct btrfs_fs_devices *find_fsid_changed( + struct btrfs_super_block *disk_super) +{ + struct btrfs_fs_devices *fs_devices; + + /* + * Handles the case where scanned device is part of an fs that had + * multiple successful changes of FSID but curently device didn't + * observe it. Meaning our fsid will be different than theirs. + */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, + BTRFS_FSID_SIZE) != 0 && + memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid, + BTRFS_FSID_SIZE) == 0 && + memcmp(fs_devices->fsid, disk_super->fsid, + BTRFS_FSID_SIZE) != 0) { + return fs_devices; + } + } + + return NULL; +} +/* * Add new device to list of registered devices * * Returns: @@ -755,14 +919,46 @@ static noinline struct btrfs_device *device_list_add(const char *path, bool *new_device_added) { struct btrfs_device *device; - struct btrfs_fs_devices *fs_devices; + struct btrfs_fs_devices *fs_devices = NULL; struct rcu_string *name; u64 found_transid = btrfs_super_generation(disk_super); u64 devid = btrfs_stack_device_id(&disk_super->dev_item); + bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & + BTRFS_FEATURE_INCOMPAT_METADATA_UUID); + bool fsid_change_in_progress = (btrfs_super_flags(disk_super) & + BTRFS_SUPER_FLAG_CHANGING_FSID_V2); + + if (fsid_change_in_progress) { + if (!has_metadata_uuid) { + /* + * When we have an image which has CHANGING_FSID_V2 set + * it might belong to either a filesystem which has + * disks with completed fsid change or it might belong + * to fs with no UUID changes in effect, handle both. + */ + fs_devices = find_fsid_inprogress(disk_super); + if (!fs_devices) + fs_devices = find_fsid(disk_super->fsid, NULL); + } else { + fs_devices = find_fsid_changed(disk_super); + } + } else if (has_metadata_uuid) { + fs_devices = find_fsid(disk_super->fsid, + disk_super->metadata_uuid); + } else { + fs_devices = find_fsid(disk_super->fsid, NULL); + } + - fs_devices = find_fsid(disk_super->fsid); if (!fs_devices) { - fs_devices = alloc_fs_devices(disk_super->fsid); + if (has_metadata_uuid) + fs_devices = alloc_fs_devices(disk_super->fsid, + disk_super->metadata_uuid); + else + fs_devices = alloc_fs_devices(disk_super->fsid, NULL); + + fs_devices->fsid_change = fsid_change_in_progress; + if (IS_ERR(fs_devices)) return ERR_CAST(fs_devices); @@ -774,6 +970,21 @@ static noinline struct btrfs_device *device_list_add(const char *path, mutex_lock(&fs_devices->device_list_mutex); device = find_device(fs_devices, devid, disk_super->dev_item.uuid); + + /* + * If this disk has been pulled into an fs devices created by + * a device which had the CHANGING_FSID_V2 flag then replace the + * metadata_uuid/fsid values of the fs_devices. + */ + if (has_metadata_uuid && fs_devices->fsid_change && + found_transid > fs_devices->latest_generation) { + memcpy(fs_devices->fsid, disk_super->fsid, + BTRFS_FSID_SIZE); + memcpy(fs_devices->metadata_uuid, + disk_super->metadata_uuid, BTRFS_FSID_SIZE); + + fs_devices->fsid_change = false; + } } if (!device) { @@ -850,6 +1061,35 @@ static noinline struct btrfs_device *device_list_add(const char *path, return ERR_PTR(-EEXIST); } + /* + * We are going to replace the device path for a given devid, + * make sure it's the same device if the device is mounted + */ + if (device->bdev) { + struct block_device *path_bdev; + + path_bdev = lookup_bdev(path); + if (IS_ERR(path_bdev)) { + mutex_unlock(&fs_devices->device_list_mutex); + return ERR_CAST(path_bdev); + } + + if (device->bdev != path_bdev) { + bdput(path_bdev); + mutex_unlock(&fs_devices->device_list_mutex); + btrfs_warn_in_rcu(device->fs_info, + "duplicate device fsid:devid for %pU:%llu old:%s new:%s", + disk_super->fsid, devid, + rcu_str_deref(device->name), path); + return ERR_PTR(-EEXIST); + } + bdput(path_bdev); + btrfs_info_in_rcu(device->fs_info, + "device fsid %pU devid %llu moved old:%s new:%s", + disk_super->fsid, devid, + rcu_str_deref(device->name), path); + } + name = rcu_string_strdup(path, GFP_NOFS); if (!name) { mutex_unlock(&fs_devices->device_list_mutex); @@ -869,8 +1109,11 @@ static noinline struct btrfs_device *device_list_add(const char *path, * it back. We need it to pick the disk with largest generation * (as above). */ - if (!fs_devices->opened) + if (!fs_devices->opened) { device->generation = found_transid; + fs_devices->latest_generation = max_t(u64, found_transid, + fs_devices->latest_generation); + } fs_devices->total_devices = btrfs_super_num_devices(disk_super); @@ -884,7 +1127,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) struct btrfs_device *device; struct btrfs_device *orig_dev; - fs_devices = alloc_fs_devices(orig->fsid); + fs_devices = alloc_fs_devices(orig->fsid, NULL); if (IS_ERR(fs_devices)) return fs_devices; @@ -1193,7 +1436,7 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, p = kmap(*page); /* align our pointer to the offset of the super block */ - *disk_super = p + (bytenr & ~PAGE_MASK); + *disk_super = p + offset_in_page(bytenr); if (btrfs_super_bytenr(*disk_super) != bytenr || btrfs_super_magic(*disk_super) != BTRFS_MAGIC) { @@ -1709,7 +1952,8 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, ptr = btrfs_device_uuid(dev_item); write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); ptr = btrfs_device_fsid(dev_item); - write_extent_buffer(leaf, trans->fs_info->fsid, ptr, BTRFS_FSID_SIZE); + write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, + ptr, BTRFS_FSID_SIZE); btrfs_mark_buffer_dirty(leaf); ret = 0; @@ -1862,12 +2106,12 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) { u64 num_devices = fs_info->fs_devices->num_devices; - btrfs_dev_replace_read_lock(&fs_info->dev_replace); + down_read(&fs_info->dev_replace.rwsem); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { ASSERT(num_devices > 1); num_devices--; } - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + up_read(&fs_info->dev_replace.rwsem); return num_devices; } @@ -1900,6 +2144,14 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, goto out; } + if (btrfs_pinned_by_swapfile(fs_info, device)) { + btrfs_warn_in_rcu(fs_info, + "cannot remove device %s (devid %llu) due to active swapfile", + rcu_str_deref(device->name), device->devid); + ret = -ETXTBSY; + goto out; + } + if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { ret = BTRFS_ERROR_DEV_TGT_REPLACE; goto out; @@ -2132,7 +2384,13 @@ static struct btrfs_device *btrfs_find_device_by_path( disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); dev_uuid = disk_super->dev_item.uuid; - device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid); + if (btrfs_fs_incompat(fs_info, METADATA_UUID)) + device = btrfs_find_device(fs_info, devid, dev_uuid, + disk_super->metadata_uuid); + else + device = btrfs_find_device(fs_info, devid, + dev_uuid, disk_super->fsid); + brelse(bh); if (!device) device = ERR_PTR(-ENOENT); @@ -2202,7 +2460,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) if (!fs_devices->seeding) return -EINVAL; - seed_devices = alloc_fs_devices(NULL); + seed_devices = alloc_fs_devices(NULL, NULL); if (IS_ERR(seed_devices)) return PTR_ERR(seed_devices); @@ -2238,7 +2496,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) fs_devices->seed = seed_devices; generate_random_uuid(fs_devices->fsid); - memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); + memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); mutex_unlock(&fs_devices->device_list_mutex); @@ -2480,7 +2738,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path * so rename the fsid on the sysfs */ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", - fs_info->fsid); + fs_info->fs_devices->fsid); if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf)) btrfs_warn(fs_info, "sysfs: failed to create fsid for sprout"); @@ -2718,8 +2976,15 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) return ret; } -static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info, - u64 logical, u64 length) +/* + * btrfs_get_chunk_map() - Find the mapping containing the given logical extent. + * @logical: Logical block offset in bytes. + * @length: Length of extent in bytes. + * + * Return: Chunk mapping or ERR_PTR. + */ +struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) { struct extent_map_tree *em_tree; struct extent_map *em; @@ -2756,7 +3021,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) int i, ret = 0; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - em = get_chunk_map(fs_info, chunk_offset, 1); + em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); if (IS_ERR(em)) { /* * This is a logic error, but we don't want to just rely on the @@ -2797,13 +3062,11 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) mutex_unlock(&fs_info->chunk_mutex); } - if (map->stripes[i].dev) { - ret = btrfs_update_device(trans, map->stripes[i].dev); - if (ret) { - mutex_unlock(&fs_devices->device_list_mutex); - btrfs_abort_transaction(trans, ret); - goto out; - } + ret = btrfs_update_device(trans, device); + if (ret) { + mutex_unlock(&fs_devices->device_list_mutex); + btrfs_abort_transaction(trans, ret); + goto out; } } mutex_unlock(&fs_devices->device_list_mutex); @@ -3437,17 +3700,11 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) { struct btrfs_balance_control *bctl = fs_info->balance_ctl; struct btrfs_root *chunk_root = fs_info->chunk_root; - struct btrfs_root *dev_root = fs_info->dev_root; - struct list_head *devices; - struct btrfs_device *device; - u64 old_size; - u64 size_to_free; u64 chunk_type; struct btrfs_chunk *chunk; struct btrfs_path *path = NULL; struct btrfs_key key; struct btrfs_key found_key; - struct btrfs_trans_handle *trans; struct extent_buffer *leaf; int slot; int ret; @@ -3462,53 +3719,6 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) u32 count_sys = 0; int chunk_reserved = 0; - /* step one make some room on all the devices */ - devices = &fs_info->fs_devices->devices; - list_for_each_entry(device, devices, dev_list) { - old_size = btrfs_device_get_total_bytes(device); - size_to_free = div_factor(old_size, 1); - size_to_free = min_t(u64, size_to_free, SZ_1M); - if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) || - btrfs_device_get_total_bytes(device) - - btrfs_device_get_bytes_used(device) > size_to_free || - test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) - continue; - - ret = btrfs_shrink_device(device, old_size - size_to_free); - if (ret == -ENOSPC) - break; - if (ret) { - /* btrfs_shrink_device never returns ret > 0 */ - WARN_ON(ret > 0); - goto error; - } - - trans = btrfs_start_transaction(dev_root, 0); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - btrfs_info_in_rcu(fs_info, - "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu", - rcu_str_deref(device->name), ret, - old_size, old_size - size_to_free); - goto error; - } - - ret = btrfs_grow_device(trans, device, old_size); - if (ret) { - btrfs_end_transaction(trans); - /* btrfs_grow_device never returns ret > 0 */ - WARN_ON(ret > 0); - btrfs_info_in_rcu(fs_info, - "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu", - rcu_str_deref(device->name), ret, - old_size, old_size - size_to_free); - goto error; - } - - btrfs_end_transaction(trans); - } - - /* step two, relocate all the chunks */ path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; @@ -3638,10 +3848,15 @@ again: ret = btrfs_relocate_chunk(fs_info, found_key.offset); mutex_unlock(&fs_info->delete_unused_bgs_mutex); - if (ret && ret != -ENOSPC) - goto error; if (ret == -ENOSPC) { enospc_errors++; + } else if (ret == -ETXTBSY) { + btrfs_info(fs_info, + "skipping relocation of block group %llu due to active swapfile", + found_key.offset); + ret = 0; + } else if (ret) { + goto error; } else { spin_lock(&fs_info->balance_lock); bctl->stat.completed++; @@ -3712,6 +3927,162 @@ static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, } /* + * Fill @buf with textual description of balance filter flags @bargs, up to + * @size_buf including the terminating null. The output may be trimmed if it + * does not fit into the provided buffer. + */ +static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, + u32 size_buf) +{ + int ret; + u32 size_bp = size_buf; + char *bp = buf; + u64 flags = bargs->flags; + char tmp_buf[128] = {'\0'}; + + if (!flags) + return; + +#define CHECK_APPEND_NOARG(a) \ + do { \ + ret = snprintf(bp, size_bp, (a)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + +#define CHECK_APPEND_1ARG(a, v1) \ + do { \ + ret = snprintf(bp, size_bp, (a), (v1)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + +#define CHECK_APPEND_2ARG(a, v1, v2) \ + do { \ + ret = snprintf(bp, size_bp, (a), (v1), (v2)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + + if (flags & BTRFS_BALANCE_ARGS_CONVERT) { + int index = btrfs_bg_flags_to_raid_index(bargs->target); + + CHECK_APPEND_1ARG("convert=%s,", get_raid_name(index)); + } + + if (flags & BTRFS_BALANCE_ARGS_SOFT) + CHECK_APPEND_NOARG("soft,"); + + if (flags & BTRFS_BALANCE_ARGS_PROFILES) { + btrfs_describe_block_groups(bargs->profiles, tmp_buf, + sizeof(tmp_buf)); + CHECK_APPEND_1ARG("profiles=%s,", tmp_buf); + } + + if (flags & BTRFS_BALANCE_ARGS_USAGE) + CHECK_APPEND_1ARG("usage=%llu,", bargs->usage); + + if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) + CHECK_APPEND_2ARG("usage=%u..%u,", + bargs->usage_min, bargs->usage_max); + + if (flags & BTRFS_BALANCE_ARGS_DEVID) + CHECK_APPEND_1ARG("devid=%llu,", bargs->devid); + + if (flags & BTRFS_BALANCE_ARGS_DRANGE) + CHECK_APPEND_2ARG("drange=%llu..%llu,", + bargs->pstart, bargs->pend); + + if (flags & BTRFS_BALANCE_ARGS_VRANGE) + CHECK_APPEND_2ARG("vrange=%llu..%llu,", + bargs->vstart, bargs->vend); + + if (flags & BTRFS_BALANCE_ARGS_LIMIT) + CHECK_APPEND_1ARG("limit=%llu,", bargs->limit); + + if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE) + CHECK_APPEND_2ARG("limit=%u..%u,", + bargs->limit_min, bargs->limit_max); + + if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) + CHECK_APPEND_2ARG("stripes=%u..%u,", + bargs->stripes_min, bargs->stripes_max); + +#undef CHECK_APPEND_2ARG +#undef CHECK_APPEND_1ARG +#undef CHECK_APPEND_NOARG + +out_overflow: + + if (size_bp < size_buf) + buf[size_buf - size_bp - 1] = '\0'; /* remove last , */ + else + buf[0] = '\0'; +} + +static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) +{ + u32 size_buf = 1024; + char tmp_buf[192] = {'\0'}; + char *buf; + char *bp; + u32 size_bp = size_buf; + int ret; + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + + buf = kzalloc(size_buf, GFP_KERNEL); + if (!buf) + return; + + bp = buf; + +#define CHECK_APPEND_1ARG(a, v1) \ + do { \ + ret = snprintf(bp, size_bp, (a), (v1)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + + if (bctl->flags & BTRFS_BALANCE_FORCE) + CHECK_APPEND_1ARG("%s", "-f "); + + if (bctl->flags & BTRFS_BALANCE_DATA) { + describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf)); + CHECK_APPEND_1ARG("-d%s ", tmp_buf); + } + + if (bctl->flags & BTRFS_BALANCE_METADATA) { + describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf)); + CHECK_APPEND_1ARG("-m%s ", tmp_buf); + } + + if (bctl->flags & BTRFS_BALANCE_SYSTEM) { + describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf)); + CHECK_APPEND_1ARG("-s%s ", tmp_buf); + } + +#undef CHECK_APPEND_1ARG + +out_overflow: + + if (size_bp < size_buf) + buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */ + btrfs_info(fs_info, "balance: %s %s", + (bctl->flags & BTRFS_BALANCE_RESUME) ? + "resume" : "start", buf); + + kfree(buf); +} + +/* * Should be called with balance mutexe held */ int btrfs_balance(struct btrfs_fs_info *fs_info, @@ -3724,6 +4095,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, int ret; u64 num_devices; unsigned seq; + bool reducing_integrity; if (btrfs_fs_closing(fs_info) || atomic_read(&fs_info->balance_pause_req) || @@ -3803,24 +4175,30 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, !(bctl->sys.target & allowed)) || ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && (fs_info->avail_metadata_alloc_bits & allowed) && - !(bctl->meta.target & allowed))) { - if (bctl->flags & BTRFS_BALANCE_FORCE) { - btrfs_info(fs_info, - "balance: force reducing metadata integrity"); - } else { - btrfs_err(fs_info, - "balance: reduces metadata integrity, use --force if you want this"); - ret = -EINVAL; - goto out; - } - } + !(bctl->meta.target & allowed))) + reducing_integrity = true; + else + reducing_integrity = false; + + /* if we're not converting, the target field is uninitialized */ + meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? + bctl->meta.target : fs_info->avail_metadata_alloc_bits; + data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? + bctl->data.target : fs_info->avail_data_alloc_bits; } while (read_seqretry(&fs_info->profiles_lock, seq)); - /* if we're not converting, the target field is uninitialized */ - meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? - bctl->meta.target : fs_info->avail_metadata_alloc_bits; - data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? - bctl->data.target : fs_info->avail_data_alloc_bits; + if (reducing_integrity) { + if (bctl->flags & BTRFS_BALANCE_FORCE) { + btrfs_info(fs_info, + "balance: force reducing metadata integrity"); + } else { + btrfs_err(fs_info, + "balance: reduces metadata integrity, use --force if you want this"); + ret = -EINVAL; + goto out; + } + } + if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { int meta_index = btrfs_bg_flags_to_raid_index(meta_target); @@ -3850,11 +4228,19 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); + describe_balance_start_or_resume(fs_info); mutex_unlock(&fs_info->balance_mutex); ret = __btrfs_balance(fs_info); mutex_lock(&fs_info->balance_mutex); + if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) + btrfs_info(fs_info, "balance: paused"); + else if (ret == -ECANCELED && atomic_read(&fs_info->balance_cancel_req)) + btrfs_info(fs_info, "balance: canceled"); + else + btrfs_info(fs_info, "balance: ended with status: %d", ret); + clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); if (bargs) { @@ -3887,10 +4273,8 @@ static int balance_kthread(void *data) int ret = 0; mutex_lock(&fs_info->balance_mutex); - if (fs_info->balance_ctl) { - btrfs_info(fs_info, "balance: resuming"); + if (fs_info->balance_ctl) ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); - } mutex_unlock(&fs_info->balance_mutex); return ret; @@ -4433,10 +4817,16 @@ again: ret = btrfs_relocate_chunk(fs_info, chunk_offset); mutex_unlock(&fs_info->delete_unused_bgs_mutex); - if (ret && ret != -ENOSPC) - goto done; - if (ret == -ENOSPC) + if (ret == -ENOSPC) { failed++; + } else if (ret) { + if (ret == -ETXTBSY) { + btrfs_warn(fs_info, + "could not shrink block group %llu due to active swapfile", + chunk_offset); + } + goto done; + } } while (key.offset-- > 0); if (failed && !retried) { @@ -4602,11 +4992,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, int devs_min; /* min devs needed */ int devs_increment; /* ndevs has to be a multiple of this */ int ncopies; /* how many copies to data has */ + int nparity; /* number of stripes worth of bytes to + store parity information */ int ret; u64 max_stripe_size; u64 max_chunk_size; u64 stripe_size; - u64 num_bytes; + u64 chunk_size; int ndevs; int i; int j; @@ -4628,6 +5020,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, devs_min = btrfs_raid_array[index].devs_min; devs_increment = btrfs_raid_array[index].devs_increment; ncopies = btrfs_raid_array[index].ncopies; + nparity = btrfs_raid_array[index].nparity; if (type & BTRFS_BLOCK_GROUP_DATA) { max_stripe_size = SZ_1G; @@ -4654,7 +5047,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, BUG_ON(1); } - /* we don't want a chunk larger than 10% of writeable space */ + /* We don't want a chunk larger than 10% of writable space */ max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), max_chunk_size); @@ -4757,30 +5150,22 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, * this will have to be fixed for RAID1 and RAID10 over * more drives */ - data_stripes = num_stripes / ncopies; - - if (type & BTRFS_BLOCK_GROUP_RAID5) - data_stripes = num_stripes - 1; - - if (type & BTRFS_BLOCK_GROUP_RAID6) - data_stripes = num_stripes - 2; + data_stripes = (num_stripes - nparity) / ncopies; /* * Use the number of data stripes to figure out how big this chunk * is really going to be in terms of logical address space, - * and compare that answer with the max chunk size + * and compare that answer with the max chunk size. If it's higher, + * we try to reduce stripe_size. */ if (stripe_size * data_stripes > max_chunk_size) { - stripe_size = div_u64(max_chunk_size, data_stripes); - - /* bump the answer up to a 16MB boundary */ - stripe_size = round_up(stripe_size, SZ_16M); - /* - * But don't go higher than the limits we found while searching - * for free extents + * Reduce stripe_size, round it up to a 16MB boundary again and + * then use it, unless it ends up being even bigger than the + * previous value we had already. */ - stripe_size = min(devices_info[ndevs - 1].max_avail, + stripe_size = min(round_up(div_u64(max_chunk_size, + data_stripes), SZ_16M), stripe_size); } @@ -4808,9 +5193,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, map->type = type; map->sub_stripes = sub_stripes; - num_bytes = stripe_size * data_stripes; + chunk_size = stripe_size * data_stripes; - trace_btrfs_chunk_alloc(info, map, start, num_bytes); + trace_btrfs_chunk_alloc(info, map, start, chunk_size); em = alloc_extent_map(); if (!em) { @@ -4821,7 +5206,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); em->map_lookup = map; em->start = start; - em->len = num_bytes; + em->len = chunk_size; em->block_start = 0; em->block_len = em->len; em->orig_block_len = stripe_size; @@ -4839,14 +5224,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, refcount_inc(&em->refs); write_unlock(&em_tree->lock); - ret = btrfs_make_block_group(trans, 0, type, start, num_bytes); + ret = btrfs_make_block_group(trans, 0, type, start, chunk_size); if (ret) goto error_del_extent; - for (i = 0; i < map->num_stripes; i++) { - num_bytes = map->stripes[i].dev->bytes_used + stripe_size; - btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); - } + for (i = 0; i < map->num_stripes; i++) + btrfs_device_set_bytes_used(map->stripes[i].dev, + map->stripes[i].dev->bytes_used + stripe_size); atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space); @@ -4890,7 +5274,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, int i = 0; int ret = 0; - em = get_chunk_map(fs_info, chunk_offset, chunk_size); + em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); if (IS_ERR(em)) return PTR_ERR(em); @@ -4971,10 +5355,10 @@ out: } /* - * Chunk allocation falls into two parts. The first part does works - * that make the new allocated chunk useable, but not do any operation - * that modifies the chunk tree. The second part does the works that - * require modifying the chunk tree. This division is important for the + * Chunk allocation falls into two parts. The first part does work + * that makes the new allocated chunk usable, but does not do any operation + * that modifies the chunk tree. The second part does the work that + * requires modifying the chunk tree. This division is important for the * bootstrap process of adding storage to a seed btrfs. */ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) @@ -5032,7 +5416,7 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) int miss_ndevs = 0; int i; - em = get_chunk_map(fs_info, chunk_offset, 1); + em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); if (IS_ERR(em)) return 1; @@ -5092,7 +5476,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) struct map_lookup *map; int ret; - em = get_chunk_map(fs_info, logical, len); + em = btrfs_get_chunk_map(fs_info, logical, len); if (IS_ERR(em)) /* * We could return errors for these cases, but that could get @@ -5122,11 +5506,11 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) ret = 1; free_extent_map(em); - btrfs_dev_replace_read_lock(&fs_info->dev_replace); + down_read(&fs_info->dev_replace.rwsem); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && fs_info->dev_replace.tgtdev) ret++; - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + up_read(&fs_info->dev_replace.rwsem); return ret; } @@ -5138,7 +5522,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, struct map_lookup *map; unsigned long len = fs_info->sectorsize; - em = get_chunk_map(fs_info, logical, len); + em = btrfs_get_chunk_map(fs_info, logical, len); if (!WARN_ON(IS_ERR(em))) { map = em->map_lookup; @@ -5155,7 +5539,7 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) struct map_lookup *map; int ret = 0; - em = get_chunk_map(fs_info, logical, len); + em = btrfs_get_chunk_map(fs_info, logical, len); if(!WARN_ON(IS_ERR(em))) { map = em->map_lookup; @@ -5314,7 +5698,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, /* discard always return a bbio */ ASSERT(bbio_ret); - em = get_chunk_map(fs_info, logical, length); + em = btrfs_get_chunk_map(fs_info, logical, length); if (IS_ERR(em)) return PTR_ERR(em); @@ -5640,7 +6024,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, return __btrfs_map_block_for_discard(fs_info, logical, *length, bbio_ret); - em = get_chunk_map(fs_info, logical, *length); + em = btrfs_get_chunk_map(fs_info, logical, *length); if (IS_ERR(em)) return PTR_ERR(em); @@ -5699,17 +6083,21 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, *length = em->len - offset; } - /* This is for when we're called from btrfs_merge_bio_hook() and all - it cares about is the length */ + /* + * This is for when we're called from btrfs_bio_fits_in_stripe and all + * it cares about is the length + */ if (!bbio_ret) goto out; - btrfs_dev_replace_read_lock(dev_replace); + down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); + /* + * Hold the semaphore for read during the whole operation, write is + * requested at commit time but must wait. + */ if (!dev_replace_is_ongoing) - btrfs_dev_replace_read_unlock(dev_replace); - else - btrfs_dev_replace_set_lock_blocking(dev_replace); + up_read(&dev_replace->rwsem); if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && !need_full_stripe(op) && dev_replace->tgtdev != NULL) { @@ -5904,12 +6292,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } out: if (dev_replace_is_ongoing) { - ASSERT(atomic_read(&dev_replace->blocking_readers) > 0); - btrfs_dev_replace_read_lock(dev_replace); - /* Barrier implied by atomic_dec_and_test */ - if (atomic_dec_and_test(&dev_replace->blocking_readers)) - cond_wake_up_nomb(&dev_replace->read_lock_wq); - btrfs_dev_replace_read_unlock(dev_replace); + lockdep_assert_held(&dev_replace->rwsem); + /* Unlock and let waiting writers proceed */ + up_read(&dev_replace->rwsem); } free_extent_map(em); return ret; @@ -5943,7 +6328,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, u64 rmap_len; int i, j, nr = 0; - em = get_chunk_map(fs_info, chunk_start, 1); + em = btrfs_get_chunk_map(fs_info, chunk_start, 1); if (IS_ERR(em)) return -EIO; @@ -6083,12 +6468,6 @@ static noinline void btrfs_schedule_bio(struct btrfs_device *device, int should_queue = 1; struct btrfs_pending_bios *pending_bios; - if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) || - !device->bdev) { - bio_io_error(bio); - return; - } - /* don't bother with additional async steps for reads, right now */ if (bio_op(bio) == REQ_OP_READ) { btrfsic_submit_bio(bio); @@ -6217,7 +6596,8 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { dev = bbio->stripes[dev_nr].dev; - if (!dev || !dev->bdev || + if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, + &dev->dev_state) || (bio_op(first_bio) == REQ_OP_WRITE && !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { bbio_error(bbio, first_bio, logical); @@ -6245,7 +6625,7 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, cur_devices = fs_info->fs_devices; while (cur_devices) { if (!fsid || - !memcmp(cur_devices->fsid, fsid, BTRFS_FSID_SIZE)) { + !memcmp(cur_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { device = find_device(cur_devices, devid, uuid); if (device) return device; @@ -6574,12 +6954,12 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, fs_devices = fs_devices->seed; } - fs_devices = find_fsid(fsid); + fs_devices = find_fsid(fsid, NULL); if (!fs_devices) { if (!btrfs_test_opt(fs_info, DEGRADED)) return ERR_PTR(-ENOENT); - fs_devices = alloc_fs_devices(fsid); + fs_devices = alloc_fs_devices(fsid, NULL); if (IS_ERR(fs_devices)) return fs_devices; @@ -6629,7 +7009,7 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_FSID_SIZE); - if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) { + if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { fs_devices = open_seed_devices(fs_info, fs_uuid); if (IS_ERR(fs_devices)) return PTR_ERR(fs_devices); @@ -6876,7 +7256,7 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, if (missing > max_tolerated) { if (!failing_dev) btrfs_warn(fs_info, - "chunk %llu missing %d devices, max tolerance is %d for writeable mount", + "chunk %llu missing %d devices, max tolerance is %d for writable mount", em->start, missing, max_tolerated); free_extent_map(em); ret = false; @@ -7387,6 +7767,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; struct extent_map *em; struct map_lookup *map; + struct btrfs_device *dev; u64 stripe_len; bool found = false; int ret = 0; @@ -7436,6 +7817,22 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, physical_offset, devid); ret = -EUCLEAN; } + + /* Make sure no dev extent is beyond device bondary */ + dev = btrfs_find_device(fs_info, devid, NULL, NULL); + if (!dev) { + btrfs_err(fs_info, "failed to find devid %llu", devid); + ret = -EUCLEAN; + goto out; + } + if (physical_offset + physical_len > dev->disk_total_bytes) { + btrfs_err(fs_info, +"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu", + devid, physical_offset, physical_len, + dev->disk_total_bytes); + ret = -EUCLEAN; + goto out; + } out: free_extent_map(em); return ret; @@ -7478,6 +7875,8 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) struct btrfs_path *path; struct btrfs_root *root = fs_info->dev_root; struct btrfs_key key; + u64 prev_devid = 0; + u64 prev_dev_ext_end = 0; int ret = 0; key.objectid = 1; @@ -7522,10 +7921,22 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext); physical_len = btrfs_dev_extent_length(leaf, dext); + /* Check if this dev extent overlaps with the previous one */ + if (devid == prev_devid && physical_offset < prev_dev_ext_end) { + btrfs_err(fs_info, +"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu", + devid, physical_offset, prev_dev_ext_end); + ret = -EUCLEAN; + goto out; + } + ret = verify_one_dev_extent(fs_info, chunk_offset, devid, physical_offset, physical_len); if (ret < 0) goto out; + prev_devid = devid; + prev_dev_ext_end = physical_offset + physical_len; + ret = btrfs_next_item(root, path); if (ret < 0) goto out; @@ -7541,3 +7952,27 @@ out: btrfs_free_path(path); return ret; } + +/* + * Check whether the given block group or device is pinned by any inode being + * used as a swapfile. + */ +bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr) +{ + struct btrfs_swapfile_pin *sp; + struct rb_node *node; + + spin_lock(&fs_info->swapfile_pins_lock); + node = fs_info->swapfile_pins.rb_node; + while (node) { + sp = rb_entry(node, struct btrfs_swapfile_pin, node); + if (ptr < sp->ptr) + node = node->rb_left; + else if (ptr > sp->ptr) + node = node->rb_right; + else + break; + } + spin_unlock(&fs_info->swapfile_pins_lock); + return node != NULL; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index aefce895e994..ed806649a473 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -210,6 +210,8 @@ BTRFS_DEVICE_GETSET_FUNCS(bytes_used); struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + u8 metadata_uuid[BTRFS_FSID_SIZE]; + bool fsid_change; struct list_head fs_list; u64 num_devices; @@ -218,6 +220,10 @@ struct btrfs_fs_devices { u64 missing_devices; u64 total_rw_bytes; u64 total_devices; + + /* Highest generation number of seen devices */ + u64 latest_generation; + struct block_device *latest_bdev; /* all of the devices in the FS, protected by a mutex @@ -261,15 +267,12 @@ struct btrfs_fs_devices { * we allocate are actually btrfs_io_bios. We'll cram as much of * struct btrfs_bio as we can into this over time. */ -typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err); struct btrfs_io_bio { unsigned int mirror_num; unsigned int stripe_index; u64 logical; u8 *csum; u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; - u8 *csum_allocated; - btrfs_io_bio_end_io_t *end_io; struct bvec_iter iter; /* * This member must come last, bio_alloc_bioset will allocate enough @@ -283,15 +286,20 @@ static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio) return container_of(bio, struct btrfs_io_bio, bio); } +static inline void btrfs_io_bio_free_csum(struct btrfs_io_bio *io_bio) +{ + if (io_bio->csum != io_bio->csum_inline) { + kfree(io_bio->csum); + io_bio->csum = NULL; + } +} + struct btrfs_bio_stripe { struct btrfs_device *dev; u64 physical; u64 length; /* only used for discard mappings */ }; -struct btrfs_bio; -typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); - struct btrfs_bio { refcount_t refs; atomic_t stripes_pending; @@ -331,6 +339,8 @@ struct btrfs_raid_attr { int tolerated_failures; /* max tolerated fail devs */ int devs_increment; /* ndevs has to be a multiple of this */ int ncopies; /* how many copies to data has */ + int nparity; /* number of stripes worth of bytes to store + * parity information */ int mindev_error; /* error code if min devs requisite is unmet */ const char raid_name[8]; /* name of the raid */ u64 bg_flag; /* block group flag of the raid */ @@ -430,6 +440,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path); int btrfs_balance(struct btrfs_fs_info *fs_info, struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs); +void btrfs_describe_block_groups(u64 flags, char *buf, u32 size_buf); int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); int btrfs_recover_balance(struct btrfs_fs_info *fs_info); int btrfs_pause_balance(struct btrfs_fs_info *fs_info); @@ -462,6 +473,8 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, u64 chunk_offset, u64 chunk_size); int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); +struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, int index) diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index ea78c3d6dcfc..f141b45ce349 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -11,6 +11,7 @@ #include <linux/security.h> #include <linux/posix_acl_xattr.h> #include <linux/iversion.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "btrfs_inode.h" #include "transaction.h" @@ -422,9 +423,15 @@ static int btrfs_initxattrs(struct inode *inode, { const struct xattr *xattr; struct btrfs_trans_handle *trans = fs_info; + unsigned int nofs_flag; char *name; int err = 0; + /* + * We're holding a transaction handle, so use a NOFS memory allocation + * context to avoid deadlock if reclaim happens. + */ + nofs_flag = memalloc_nofs_save(); for (xattr = xattr_array; xattr->name != NULL; xattr++) { name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(xattr->name) + 1, GFP_KERNEL); @@ -440,6 +447,7 @@ static int btrfs_initxattrs(struct inode *inode, if (err < 0) break; } + memalloc_nofs_restore(nofs_flag); return err; } diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 95983c744164..1645fcfd9691 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -244,11 +244,13 @@ wait_for_old_object: ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)); - cache->cache.ops->put_object(&xobject->fscache, cachefiles_obj_put_wait_retry); + cache->cache.ops->put_object(&xobject->fscache, + (enum fscache_obj_ref_trace)cachefiles_obj_put_wait_retry); goto try_again; requeue: - cache->cache.ops->put_object(&xobject->fscache, cachefiles_obj_put_wait_timeo); + cache->cache.ops->put_object(&xobject->fscache, + (enum fscache_obj_ref_trace)cachefiles_obj_put_wait_timeo); _leave(" = -ETIMEDOUT"); return -ETIMEDOUT; } @@ -336,7 +338,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, try_again: /* first step is to make up a grave dentry in the graveyard */ sprintf(nbuffer, "%08x%08x", - (uint32_t) get_seconds(), + (uint32_t) ktime_get_real_seconds(), (uint32_t) atomic_inc_return(&cache->gravecounter)); /* do the multiway lock magic */ diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 40f7595aad10..8a577409d030 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -535,7 +535,10 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, netpage->index, cachefiles_gfp); if (ret < 0) { if (ret == -EEXIST) { + put_page(backpage); + backpage = NULL; put_page(netpage); + netpage = NULL; fscache_retrieval_complete(op, 1); continue; } @@ -608,7 +611,10 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, netpage->index, cachefiles_gfp); if (ret < 0) { if (ret == -EEXIST) { + put_page(backpage); + backpage = NULL; put_page(netpage); + netpage = NULL; fscache_retrieval_complete(op, 1); continue; } @@ -962,11 +968,8 @@ void cachefiles_uncache_page(struct fscache_object *_object, struct page *page) __releases(&object->fscache.cookie->lock) { struct cachefiles_object *object; - struct cachefiles_cache *cache; object = container_of(_object, struct cachefiles_object, fscache); - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); _enter("%p,{%lu}", object, page->index); diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 0a29a00aed2e..511e6c68156a 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -135,7 +135,8 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object, struct dentry *dentry = object->dentry; int ret; - ASSERT(dentry); + if (!dentry) + return -ESTALE; _enter("%p,#%d", object, auxdata->len); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 27cad84dab23..189df668b6a0 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1931,10 +1931,17 @@ static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off, if (!prealloc_cf) return -ENOMEM; - /* Start by sync'ing the source file */ + /* Start by sync'ing the source and destination files */ ret = file_write_and_wait_range(src_file, src_off, (src_off + len)); - if (ret < 0) + if (ret < 0) { + dout("failed to write src file (%zd)\n", ret); + goto out; + } + ret = file_write_and_wait_range(dst_file, dst_off, (dst_off + len)); + if (ret < 0) { + dout("failed to write dst file (%zd)\n", ret); goto out; + } /* * We need FILE_WR caps for dst_ci and FILE_RD for src_ci as other diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 67a9aeb2f4ec..bd13a3267ae0 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -80,12 +80,8 @@ static int parse_reply_info_in(void **p, void *end, info->symlink = *p; *p += info->symlink_len; - if (features & CEPH_FEATURE_DIRLAYOUTHASH) - ceph_decode_copy_safe(p, end, &info->dir_layout, - sizeof(info->dir_layout), bad); - else - memset(&info->dir_layout, 0, sizeof(info->dir_layout)); - + ceph_decode_copy_safe(p, end, &info->dir_layout, + sizeof(info->dir_layout), bad); ceph_decode_32_safe(p, end, info->xattr_len, bad); ceph_decode_need(p, end, info->xattr_len, bad); info->xattr_data = *p; @@ -3182,10 +3178,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, recon_state.pagelist = pagelist; if (session->s_con.peer_features & CEPH_FEATURE_MDSENC) recon_state.msg_version = 3; - else if (session->s_con.peer_features & CEPH_FEATURE_FLOCK) - recon_state.msg_version = 2; else - recon_state.msg_version = 1; + recon_state.msg_version = 2; err = iterate_session_caps(session, encode_caps_cb, &recon_state); if (err < 0) goto fail; diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 32d4f13784ba..03f4d24db8fe 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -237,7 +237,8 @@ static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op, ceph_put_snap_realm(mdsc, realm); realm = next; } - ceph_put_snap_realm(mdsc, realm); + if (realm) + ceph_put_snap_realm(mdsc, realm); up_read(&mdsc->snap_rwsem); return exceeded; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index b5ecd6f50360..4e9a7cc488da 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -563,8 +563,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",noacl"); #endif - if (fsopt->flags & CEPH_MOUNT_OPT_NOCOPYFROM) - seq_puts(m, ",nocopyfrom"); + if ((fsopt->flags & CEPH_MOUNT_OPT_NOCOPYFROM) == 0) + seq_puts(m, ",copyfrom"); if (fsopt->mds_namespace) seq_show_option(m, "mds_namespace", fsopt->mds_namespace); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index c005a5400f2e..79a265ba9200 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -42,7 +42,9 @@ #define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */ #define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */ -#define CEPH_MOUNT_OPT_DEFAULT CEPH_MOUNT_OPT_DCACHE +#define CEPH_MOUNT_OPT_DEFAULT \ + (CEPH_MOUNT_OPT_DCACHE | \ + CEPH_MOUNT_OPT_NOCOPYFROM) #define ceph_set_mount_opt(fsc, opt) \ (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt; diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index abcd78e332fe..85dadb93c992 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -133,7 +133,7 @@ config CIFS_XATTR config CIFS_POSIX bool "CIFS POSIX Extensions" - depends on CIFS_XATTR + depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY && CIFS_XATTR help Enabling this option will cause the cifs client to attempt to negotiate a newer dialect with servers, such as Samba 3.0.5 diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 3713d22b95a7..907e85d65bb4 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -174,7 +174,7 @@ cifs_bp_rename_retry: cifs_dbg(FYI, "using cifs_sb prepath <%s>\n", cifs_sb->prepath); memcpy(full_path+dfsplen+1, cifs_sb->prepath, pplen-1); - full_path[dfsplen] = '\\'; + full_path[dfsplen] = dirsep; for (i = 0; i < pplen-1; i++) if (full_path[dfsplen+1+i] == '/') full_path[dfsplen+1+i] = CIFS_DIR_SEP(cifs_sb); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 74c33d5fafc8..c9bc56b1baac 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2541,14 +2541,13 @@ static int cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list, struct cifs_aio_ctx *ctx) { - int wait_retry = 0; unsigned int wsize, credits; int rc; struct TCP_Server_Info *server = tlink_tcon(wdata->cfile->tlink)->ses->server; /* - * Try to resend this wdata, waiting for credits up to 3 seconds. + * Wait for credits to resend this wdata. * Note: we are attempting to resend the whole wdata not in segments */ do { @@ -2556,19 +2555,13 @@ cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list, server, wdata->bytes, &wsize, &credits); if (rc) - break; + goto out; if (wsize < wdata->bytes) { add_credits_and_wake_if(server, credits, 0); msleep(1000); - wait_retry++; } - } while (wsize < wdata->bytes && wait_retry < 3); - - if (wsize < wdata->bytes) { - rc = -EBUSY; - goto out; - } + } while (wsize < wdata->bytes); rc = -EAGAIN; while (rc == -EAGAIN) { @@ -3234,14 +3227,13 @@ static int cifs_resend_rdata(struct cifs_readdata *rdata, struct list_head *rdata_list, struct cifs_aio_ctx *ctx) { - int wait_retry = 0; unsigned int rsize, credits; int rc; struct TCP_Server_Info *server = tlink_tcon(rdata->cfile->tlink)->ses->server; /* - * Try to resend this rdata, waiting for credits up to 3 seconds. + * Wait for credits to resend this rdata. * Note: we are attempting to resend the whole rdata not in segments */ do { @@ -3249,24 +3241,13 @@ static int cifs_resend_rdata(struct cifs_readdata *rdata, &rsize, &credits); if (rc) - break; + goto out; if (rsize < rdata->bytes) { add_credits_and_wake_if(server, credits, 0); msleep(1000); - wait_retry++; } - } while (rsize < rdata->bytes && wait_retry < 3); - - /* - * If we can't find enough credits to send this rdata - * release the rdata and return failure, this will pass - * whatever I/O amount we have finished to VFS. - */ - if (rsize < rdata->bytes) { - rc = -EBUSY; - goto out; - } + } while (rsize < rdata->bytes); rc = -EAGAIN; while (rc == -EAGAIN) { diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 9e7ef7ec2d70..a8999f930b22 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -97,7 +97,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, if (rc) goto finished; - smb2_set_next_command(server, &rqst[num_rqst++]); + smb2_set_next_command(server, &rqst[num_rqst++], 0); /* Operation */ switch (command) { @@ -111,7 +111,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_O_INFO_FILE, 0, sizeof(struct smb2_file_all_info) + PATH_MAX * 2, 0, NULL); - smb2_set_next_command(server, &rqst[num_rqst]); + smb2_set_next_command(server, &rqst[num_rqst], 0); smb2_set_related(&rqst[num_rqst++]); break; case SMB2_OP_DELETE: @@ -127,14 +127,14 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, rqst[num_rqst].rq_iov = si_iov; rqst[num_rqst].rq_nvec = 1; - size[0] = 8; + size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */ data[0] = &delete_pending[0]; rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID, COMPOUND_FID, current->tgid, FILE_DISPOSITION_INFORMATION, SMB2_O_INFO_FILE, 0, data, size); - smb2_set_next_command(server, &rqst[num_rqst]); + smb2_set_next_command(server, &rqst[num_rqst], 1); smb2_set_related(&rqst[num_rqst++]); break; case SMB2_OP_SET_EOF: @@ -149,7 +149,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, COMPOUND_FID, current->tgid, FILE_END_OF_FILE_INFORMATION, SMB2_O_INFO_FILE, 0, data, size); - smb2_set_next_command(server, &rqst[num_rqst]); + smb2_set_next_command(server, &rqst[num_rqst], 0); smb2_set_related(&rqst[num_rqst++]); break; case SMB2_OP_SET_INFO: @@ -165,7 +165,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, COMPOUND_FID, current->tgid, FILE_BASIC_INFORMATION, SMB2_O_INFO_FILE, 0, data, size); - smb2_set_next_command(server, &rqst[num_rqst]); + smb2_set_next_command(server, &rqst[num_rqst], 0); smb2_set_related(&rqst[num_rqst++]); break; case SMB2_OP_RENAME: @@ -189,7 +189,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, COMPOUND_FID, current->tgid, FILE_RENAME_INFORMATION, SMB2_O_INFO_FILE, 0, data, size); - smb2_set_next_command(server, &rqst[num_rqst]); + smb2_set_next_command(server, &rqst[num_rqst], 0); smb2_set_related(&rqst[num_rqst++]); break; case SMB2_OP_HARDLINK: @@ -213,7 +213,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, COMPOUND_FID, current->tgid, FILE_LINK_INFORMATION, SMB2_O_INFO_FILE, 0, data, size); - smb2_set_next_command(server, &rqst[num_rqst]); + smb2_set_next_command(server, &rqst[num_rqst], 0); smb2_set_related(&rqst[num_rqst++]); break; default: diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 225fec1cfa67..e25c7aade98a 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1194,7 +1194,7 @@ smb2_ioctl_query_info(const unsigned int xid, rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, path); if (rc) goto iqinf_exit; - smb2_set_next_command(ses->server, &rqst[0]); + smb2_set_next_command(ses->server, &rqst[0], 0); /* Query */ memset(&qi_iov, 0, sizeof(qi_iov)); @@ -1208,7 +1208,7 @@ smb2_ioctl_query_info(const unsigned int xid, qi.output_buffer_length, buffer); if (rc) goto iqinf_exit; - smb2_set_next_command(ses->server, &rqst[1]); + smb2_set_next_command(ses->server, &rqst[1], 0); smb2_set_related(&rqst[1]); /* Close */ @@ -1761,16 +1761,23 @@ smb2_set_related(struct smb_rqst *rqst) char smb2_padding[7] = {0, 0, 0, 0, 0, 0, 0}; void -smb2_set_next_command(struct TCP_Server_Info *server, struct smb_rqst *rqst) +smb2_set_next_command(struct TCP_Server_Info *server, struct smb_rqst *rqst, + bool has_space_for_padding) { struct smb2_sync_hdr *shdr; unsigned long len = smb_rqst_len(server, rqst); /* SMB headers in a compound are 8 byte aligned. */ if (len & 7) { - rqst->rq_iov[rqst->rq_nvec].iov_base = smb2_padding; - rqst->rq_iov[rqst->rq_nvec].iov_len = 8 - (len & 7); - rqst->rq_nvec++; + if (has_space_for_padding) { + len = rqst->rq_iov[rqst->rq_nvec - 1].iov_len; + rqst->rq_iov[rqst->rq_nvec - 1].iov_len = + (len + 7) & ~7; + } else { + rqst->rq_iov[rqst->rq_nvec].iov_base = smb2_padding; + rqst->rq_iov[rqst->rq_nvec].iov_len = 8 - (len & 7); + rqst->rq_nvec++; + } len = smb_rqst_len(server, rqst); } @@ -1820,7 +1827,7 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, &srch_path); if (rc) goto qfs_exit; - smb2_set_next_command(server, &rqst[0]); + smb2_set_next_command(server, &rqst[0], 0); memset(&qi_iov, 0, sizeof(qi_iov)); rqst[1].rq_iov = qi_iov; @@ -1833,7 +1840,7 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon, NULL); if (rc) goto qfs_exit; - smb2_set_next_command(server, &rqst[1]); + smb2_set_next_command(server, &rqst[1], 0); smb2_set_related(&rqst[1]); memset(&close_iov, 0, sizeof(close_iov)); diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 9f4e9ed9ce53..2fe78acd7d0c 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -117,7 +117,8 @@ extern int smb3_crypto_aead_allocate(struct TCP_Server_Info *server); extern unsigned long smb_rqst_len(struct TCP_Server_Info *server, struct smb_rqst *rqst); extern void smb2_set_next_command(struct TCP_Server_Info *server, - struct smb_rqst *rqst); + struct smb_rqst *rqst, + bool has_space_for_padding); extern void smb2_set_related(struct smb_rqst *rqst); /* @@ -98,12 +98,6 @@ static void *dax_make_entry(pfn_t pfn, unsigned long flags) return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT)); } -static void *dax_make_page_entry(struct page *page) -{ - pfn_t pfn = page_to_pfn_t(page); - return dax_make_entry(pfn, PageHead(page) ? DAX_PMD : 0); -} - static bool dax_is_locked(void *entry) { return xa_to_value(entry) & DAX_LOCKED; @@ -116,12 +110,12 @@ static unsigned int dax_entry_order(void *entry) return 0; } -static int dax_is_pmd_entry(void *entry) +static unsigned long dax_is_pmd_entry(void *entry) { return xa_to_value(entry) & DAX_PMD; } -static int dax_is_pte_entry(void *entry) +static bool dax_is_pte_entry(void *entry) { return !(xa_to_value(entry) & DAX_PMD); } @@ -222,9 +216,8 @@ static void *get_unlocked_entry(struct xa_state *xas) ewait.wait.func = wake_exceptional_entry_func; for (;;) { - entry = xas_load(xas); - if (!entry || xa_is_internal(entry) || - WARN_ON_ONCE(!xa_is_value(entry)) || + entry = xas_find_conflict(xas); + if (!entry || WARN_ON_ONCE(!xa_is_value(entry)) || !dax_is_locked(entry)) return entry; @@ -239,6 +232,34 @@ static void *get_unlocked_entry(struct xa_state *xas) } } +/* + * The only thing keeping the address space around is the i_pages lock + * (it's cycled in clear_inode() after removing the entries from i_pages) + * After we call xas_unlock_irq(), we cannot touch xas->xa. + */ +static void wait_entry_unlocked(struct xa_state *xas, void *entry) +{ + struct wait_exceptional_entry_queue ewait; + wait_queue_head_t *wq; + + init_wait(&ewait.wait); + ewait.wait.func = wake_exceptional_entry_func; + + wq = dax_entry_waitqueue(xas, entry, &ewait.key); + prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); + xas_unlock_irq(xas); + schedule(); + finish_wait(wq, &ewait.wait); + + /* + * Entry lock waits are exclusive. Wake up the next waiter since + * we aren't sure we will acquire the entry lock and thus wake + * the next waiter up on unlock. + */ + if (waitqueue_active(wq)) + __wake_up(wq, TASK_NORMAL, 1, &ewait.key); +} + static void put_unlocked_entry(struct xa_state *xas, void *entry) { /* If we were the only waiter woken, wake the next one */ @@ -255,6 +276,7 @@ static void dax_unlock_entry(struct xa_state *xas, void *entry) { void *old; + BUG_ON(dax_is_locked(entry)); xas_reset(xas); xas_lock_irq(xas); old = xas_store(xas, entry); @@ -352,16 +374,27 @@ static struct page *dax_busy_page(void *entry) return NULL; } -bool dax_lock_mapping_entry(struct page *page) +/* + * dax_lock_mapping_entry - Lock the DAX entry corresponding to a page + * @page: The page whose entry we want to lock + * + * Context: Process context. + * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could + * not be locked. + */ +dax_entry_t dax_lock_page(struct page *page) { XA_STATE(xas, NULL, 0); void *entry; + /* Ensure page->mapping isn't freed while we look at it */ + rcu_read_lock(); for (;;) { struct address_space *mapping = READ_ONCE(page->mapping); - if (!dax_mapping(mapping)) - return false; + entry = NULL; + if (!mapping || !dax_mapping(mapping)) + break; /* * In the device-dax case there's no need to lock, a @@ -370,8 +403,9 @@ bool dax_lock_mapping_entry(struct page *page) * otherwise we would not have a valid pfn_to_page() * translation. */ + entry = (void *)~0UL; if (S_ISCHR(mapping->host->i_mode)) - return true; + break; xas.xa = &mapping->i_pages; xas_lock_irq(&xas); @@ -382,20 +416,20 @@ bool dax_lock_mapping_entry(struct page *page) xas_set(&xas, page->index); entry = xas_load(&xas); if (dax_is_locked(entry)) { - entry = get_unlocked_entry(&xas); - /* Did the page move while we slept? */ - if (dax_to_pfn(entry) != page_to_pfn(page)) { - xas_unlock_irq(&xas); - continue; - } + rcu_read_unlock(); + wait_entry_unlocked(&xas, entry); + rcu_read_lock(); + continue; } dax_lock_entry(&xas, entry); xas_unlock_irq(&xas); - return true; + break; } + rcu_read_unlock(); + return (dax_entry_t)entry; } -void dax_unlock_mapping_entry(struct page *page) +void dax_unlock_page(struct page *page, dax_entry_t cookie) { struct address_space *mapping = page->mapping; XA_STATE(xas, &mapping->i_pages, page->index); @@ -403,7 +437,7 @@ void dax_unlock_mapping_entry(struct page *page) if (S_ISCHR(mapping->host->i_mode)) return; - dax_unlock_entry(&xas, dax_make_page_entry(page)); + dax_unlock_entry(&xas, (void *)cookie); } /* @@ -445,11 +479,9 @@ static void *grab_mapping_entry(struct xa_state *xas, retry: xas_lock_irq(xas); entry = get_unlocked_entry(xas); - if (xa_is_internal(entry)) - goto fallback; if (entry) { - if (WARN_ON_ONCE(!xa_is_value(entry))) { + if (!xa_is_value(entry)) { xas_set_err(xas, EIO); goto out_unlock; } @@ -1628,8 +1660,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) /* Did we race with someone splitting entry or so? */ if (!entry || (order == 0 && !dax_is_pte_entry(entry)) || - (order == PMD_ORDER && (xa_is_internal(entry) || - !dax_is_pmd_entry(entry)))) { + (order == PMD_ORDER && !dax_is_pmd_entry(entry))) { put_unlocked_entry(&xas, entry); xas_unlock_irq(&xas); trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, diff --git a/fs/direct-io.c b/fs/direct-io.c index 722d17c88edb..41a0e97252ae 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -325,8 +325,8 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) */ dio->iocb->ki_pos += transferred; - if (dio->op == REQ_OP_WRITE) - ret = generic_write_sync(dio->iocb, transferred); + if (ret > 0 && dio->op == REQ_OP_WRITE) + ret = generic_write_sync(dio->iocb, ret); dio->iocb->ki_complete(dio->iocb, ret, 0); } diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 645158dc33f1..c69927bed4ef 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -77,7 +77,7 @@ static bool dentry_connected(struct dentry *dentry) struct dentry *parent = dget_parent(dentry); dput(dentry); - if (IS_ROOT(dentry)) { + if (dentry == parent) { dput(parent); return false; } @@ -147,6 +147,7 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, tmp = lookup_one_len_unlocked(nbuf, parent, strlen(nbuf)); if (IS_ERR(tmp)) { dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp)); + err = PTR_ERR(tmp); goto out_err; } if (tmp != dentry) { diff --git a/fs/ext2/super.c b/fs/ext2/super.c index cb91baa4275d..eb11502e3fcd 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -892,6 +892,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_magic != EXT2_SUPER_MAGIC) goto cantfind_ext2; + opts.s_mount_opt = 0; /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); if (def_mount_opts & EXT2_DEFM_DEBUG) diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 62d9a659a8ff..dd8f10db82e9 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -612,9 +612,9 @@ skip_replace: } cleanup: - brelse(bh); if (!(bh && header == HDR(bh))) kfree(header); + brelse(bh); up_write(&EXT2_I(inode)->xattr_sem); return error; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 05f01fbd9c7f..22a9d8159720 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5835,9 +5835,10 @@ int ext4_mark_iloc_dirty(handle_t *handle, { int err = 0; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { + put_bh(iloc->bh); return -EIO; - + } if (IS_I_VERSION(inode)) inode_inc_iversion(inode); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 17adcb16a9c8..437f71fe83ae 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -126,6 +126,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, if (!is_dx_block && type == INDEX) { ext4_error_inode(inode, func, line, block, "directory leaf block found instead of index block"); + brelse(bh); return ERR_PTR(-EFSCORRUPTED); } if (!ext4_has_metadata_csum(inode->i_sb) || @@ -2811,7 +2812,9 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) list_del_init(&EXT4_I(inode)->i_orphan); mutex_unlock(&sbi->s_orphan_lock); } - } + } else + brelse(iloc.bh); + jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); jbd_debug(4, "orphan inode %lu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index ebbc663d0798..a5efee34415f 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -459,16 +459,18 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, bh); - if (err) + if (err) { + brelse(bh); return err; + } ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", first_cluster, first_cluster - start, count2); ext4_set_bits(bh->b_data, first_cluster - start, count2); err = ext4_handle_dirty_metadata(handle, NULL, bh); + brelse(bh); if (unlikely(err)) return err; - brelse(bh); } return 0; @@ -605,7 +607,6 @@ handle_bb: bh = bclean(handle, sb, block); if (IS_ERR(bh)) { err = PTR_ERR(bh); - bh = NULL; goto out; } overhead = ext4_group_overhead_blocks(sb, group); @@ -618,9 +619,9 @@ handle_bb: ext4_mark_bitmap_end(EXT4_B2C(sbi, group_data[i].blocks_count), sb->s_blocksize * 8, bh->b_data); err = ext4_handle_dirty_metadata(handle, NULL, bh); + brelse(bh); if (err) goto out; - brelse(bh); handle_ib: if (bg_flags[i] & EXT4_BG_INODE_UNINIT) @@ -635,18 +636,16 @@ handle_ib: bh = bclean(handle, sb, block); if (IS_ERR(bh)) { err = PTR_ERR(bh); - bh = NULL; goto out; } ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, bh->b_data); err = ext4_handle_dirty_metadata(handle, NULL, bh); + brelse(bh); if (err) goto out; - brelse(bh); } - bh = NULL; /* Mark group tables in block bitmap */ for (j = 0; j < GROUP_TABLE_COUNT; j++) { @@ -685,7 +684,6 @@ handle_ib: } out: - brelse(bh); err2 = ext4_journal_stop(handle); if (err2 && !err) err = err2; @@ -873,6 +871,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); if (unlikely(err)) { ext4_std_error(sb, err); + iloc.bh = NULL; goto exit_inode; } brelse(dind); @@ -924,6 +923,7 @@ static int add_new_gdb_meta_bg(struct super_block *sb, sizeof(struct buffer_head *), GFP_NOFS); if (!n_group_desc) { + brelse(gdb_bh); err = -ENOMEM; ext4_warning(sb, "not enough memory for %lu groups", gdb_num + 1); @@ -939,8 +939,6 @@ static int add_new_gdb_meta_bg(struct super_block *sb, kvfree(o_group_desc); BUFFER_TRACE(gdb_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, gdb_bh); - if (unlikely(err)) - brelse(gdb_bh); return err; } @@ -1124,8 +1122,10 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, backup_block, backup_block - ext4_group_first_block_no(sb, group)); BUFFER_TRACE(bh, "get_write_access"); - if ((err = ext4_journal_get_write_access(handle, bh))) + if ((err = ext4_journal_get_write_access(handle, bh))) { + brelse(bh); break; + } lock_buffer(bh); memcpy(bh->b_data, data, size); if (rest) @@ -2023,7 +2023,7 @@ retry: err = ext4_alloc_flex_bg_array(sb, n_group + 1); if (err) - return err; + goto out; err = ext4_mb_alloc_groupinfo(sb, n_group + 1); if (err) @@ -2059,6 +2059,10 @@ retry: n_blocks_count_retry = 0; free_flex_gd(flex_gd); flex_gd = NULL; + if (resize_inode) { + iput(resize_inode); + resize_inode = NULL; + } goto retry; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a221f1cdf704..53ff6c2a26ed 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4075,6 +4075,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_groups_count = blocks_count; sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); + if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) != + le32_to_cpu(es->s_inodes_count)) { + ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu", + le32_to_cpu(es->s_inodes_count), + ((u64)sbi->s_groups_count * sbi->s_inodes_per_group)); + ret = -EINVAL; + goto failed_mount; + } db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); if (ext4_has_feature_meta_bg(sb)) { @@ -4094,14 +4102,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ret = -ENOMEM; goto failed_mount; } - if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) != - le32_to_cpu(es->s_inodes_count)) { - ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu", - le32_to_cpu(es->s_inodes_count), - ((u64)sbi->s_groups_count * sbi->s_inodes_per_group)); - ret = -EINVAL; - goto failed_mount; - } bgl_lock_init(sbi->s_blockgroup_lock); @@ -4510,6 +4510,7 @@ failed_mount6: percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); + percpu_free_rwsem(&sbi->s_journal_flag_rwsem); failed_mount5: ext4_ext_release(sb); ext4_release_system_zone(sb); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index f36fc5d5b257..7643d52c776c 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1031,10 +1031,8 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, inode_lock(ea_inode); ret = ext4_reserve_inode_write(handle, ea_inode, &iloc); - if (ret) { - iloc.bh = NULL; + if (ret) goto out; - } ref_count = ext4_xattr_inode_get_ref(ea_inode); ref_count += ref_change; @@ -1080,12 +1078,10 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, } ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc); - iloc.bh = NULL; if (ret) ext4_warning_inode(ea_inode, "ext4_mark_iloc_dirty() failed ret=%d", ret); out: - brelse(iloc.bh); inode_unlock(ea_inode); return ret; } @@ -1388,6 +1384,12 @@ retry: bh = ext4_getblk(handle, ea_inode, block, 0); if (IS_ERR(bh)) return PTR_ERR(bh); + if (!bh) { + WARN_ON_ONCE(1); + EXT4_ERROR_INODE(ea_inode, + "ext4_getblk() return bh = NULL"); + return -EFSCORRUPTED; + } ret = ext4_journal_get_write_access(handle, bh); if (ret) goto out; @@ -2276,8 +2278,10 @@ static struct buffer_head *ext4_xattr_get_block(struct inode *inode) if (!bh) return ERR_PTR(-EIO); error = ext4_xattr_check_block(inode, bh); - if (error) + if (error) { + brelse(bh); return ERR_PTR(error); + } return bh; } @@ -2397,6 +2401,8 @@ retry_inode: error = ext4_xattr_block_set(handle, inode, &i, &bs); } else if (error == -ENOSPC) { if (EXT4_I(inode)->i_file_acl && !bs.s.base) { + brelse(bs.bh); + bs.bh = NULL; error = ext4_xattr_block_find(inode, &i, &bs); if (error) goto cleanup; @@ -2617,6 +2623,8 @@ out: kfree(buffer); if (is) brelse(is->iloc.bh); + if (bs) + brelse(bs->bh); kfree(is); kfree(bs); @@ -2696,7 +2704,6 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle) { struct ext4_xattr_ibody_header *header; - struct buffer_head *bh; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); static unsigned int mnt_count; size_t min_offs; @@ -2737,13 +2744,17 @@ retry: * EA block can hold new_extra_isize bytes. */ if (EXT4_I(inode)->i_file_acl) { + struct buffer_head *bh; + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); error = -EIO; if (!bh) goto cleanup; error = ext4_xattr_check_block(inode, bh); - if (error) + if (error) { + brelse(bh); goto cleanup; + } base = BHDR(bh); end = bh->b_data + bh->b_size; min_offs = end - base; diff --git a/fs/file.c b/fs/file.c index 7ffd6e9d103d..50304c7525ea 100644 --- a/fs/file.c +++ b/fs/file.c @@ -158,7 +158,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr) * or have finished their rcu_read_lock_sched() section. */ if (atomic_read(&files->count) > 1) - synchronize_sched(); + synchronize_rcu(); spin_lock(&files->file_lock); if (!new_fdt) diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 9edc920f651f..6d9cb1719de5 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -730,6 +730,9 @@ static const struct fscache_state *fscache_drop_object(struct fscache_object *ob if (awaken) wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING); + if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); + /* Prevent a race with our last child, which has to signal EV_CLEARED * before dropping our spinlock. diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ae813e609932..a5e516a40e7a 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -165,9 +165,13 @@ static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background) static void fuse_drop_waiting(struct fuse_conn *fc) { - if (fc->connected) { - atomic_dec(&fc->num_waiting); - } else if (atomic_dec_and_test(&fc->num_waiting)) { + /* + * lockess check of fc->connected is okay, because atomic_dec_and_test() + * provides a memory barrier mached with the one in fuse_wait_aborted() + * to ensure no wake-up is missed. + */ + if (atomic_dec_and_test(&fc->num_waiting) && + !READ_ONCE(fc->connected)) { /* wake up aborters */ wake_up_all(&fc->blocked_waitq); } @@ -1768,8 +1772,10 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, req->in.args[1].size = total_len; err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique); - if (err) + if (err) { fuse_retrieve_end(fc, req); + fuse_put_request(fc, req); + } return err; } @@ -2219,6 +2225,8 @@ EXPORT_SYMBOL_GPL(fuse_abort_conn); void fuse_wait_aborted(struct fuse_conn *fc) { + /* matches implicit memory barrier in fuse_drop_waiting() */ + smp_mb(); wait_event(fc->blocked_waitq, atomic_read(&fc->num_waiting) == 0); } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 47395b0c3b35..e909678afa2d 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1119,8 +1119,10 @@ static int fuse_permission(struct inode *inode, int mask) if (fc->default_permissions || ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) { struct fuse_inode *fi = get_fuse_inode(inode); + u32 perm_mask = STATX_MODE | STATX_UID | STATX_GID; - if (time_before64(fi->i_time, get_jiffies_64())) { + if (perm_mask & READ_ONCE(fi->inval_mask) || + time_before64(fi->i_time, get_jiffies_64())) { refreshed = true; err = fuse_perm_getattr(inode, mask); @@ -1241,7 +1243,7 @@ static int fuse_dir_open(struct inode *inode, struct file *file) static int fuse_dir_release(struct inode *inode, struct file *file) { - fuse_release_common(file, FUSE_RELEASEDIR); + fuse_release_common(file, true); return 0; } @@ -1249,7 +1251,25 @@ static int fuse_dir_release(struct inode *inode, struct file *file) static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - return fuse_fsync_common(file, start, end, datasync, 1); + struct inode *inode = file->f_mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + int err; + + if (is_bad_inode(inode)) + return -EIO; + + if (fc->no_fsyncdir) + return 0; + + inode_lock(inode); + err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNCDIR); + if (err == -ENOSYS) { + fc->no_fsyncdir = 1; + err = 0; + } + inode_unlock(inode); + + return err; } static long fuse_dir_ioctl(struct file *file, unsigned int cmd, diff --git a/fs/fuse/file.c b/fs/fuse/file.c index cc2121b37bf5..ffaffe18352a 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -89,12 +89,12 @@ static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) iput(req->misc.release.inode); } -static void fuse_file_put(struct fuse_file *ff, bool sync) +static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) { if (refcount_dec_and_test(&ff->count)) { struct fuse_req *req = ff->reserved_req; - if (ff->fc->no_open) { + if (ff->fc->no_open && !isdir) { /* * Drop the release request when client does not * implement 'open' @@ -247,10 +247,11 @@ static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode) req->in.args[0].value = inarg; } -void fuse_release_common(struct file *file, int opcode) +void fuse_release_common(struct file *file, bool isdir) { struct fuse_file *ff = file->private_data; struct fuse_req *req = ff->reserved_req; + int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; fuse_prepare_release(ff, file->f_flags, opcode); @@ -272,7 +273,7 @@ void fuse_release_common(struct file *file, int opcode) * synchronous RELEASE is allowed (and desirable) in this case * because the server can be trusted not to screw up. */ - fuse_file_put(ff, ff->fc->destroy_req != NULL); + fuse_file_put(ff, ff->fc->destroy_req != NULL, isdir); } static int fuse_open(struct inode *inode, struct file *file) @@ -288,7 +289,7 @@ static int fuse_release(struct inode *inode, struct file *file) if (fc->writeback_cache) write_inode_now(inode, 1); - fuse_release_common(file, FUSE_RELEASE); + fuse_release_common(file, false); /* return value is ignored by VFS */ return 0; @@ -302,7 +303,7 @@ void fuse_sync_release(struct fuse_file *ff, int flags) * iput(NULL) is a no-op and since the refcount is 1 and everything's * synchronous, we are fine with not doing igrab() here" */ - fuse_file_put(ff, true); + fuse_file_put(ff, true, false); } EXPORT_SYMBOL_GPL(fuse_sync_release); @@ -441,13 +442,30 @@ static int fuse_flush(struct file *file, fl_owner_t id) } int fuse_fsync_common(struct file *file, loff_t start, loff_t end, - int datasync, int isdir) + int datasync, int opcode) { struct inode *inode = file->f_mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_file *ff = file->private_data; FUSE_ARGS(args); struct fuse_fsync_in inarg; + + memset(&inarg, 0, sizeof(inarg)); + inarg.fh = ff->fh; + inarg.fsync_flags = datasync ? 1 : 0; + args.in.h.opcode = opcode; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + return fuse_simple_request(fc, &args); +} + +static int fuse_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); int err; if (is_bad_inode(inode)) @@ -479,34 +497,18 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, if (err) goto out; - if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) + if (fc->no_fsync) goto out; - memset(&inarg, 0, sizeof(inarg)); - inarg.fh = ff->fh; - inarg.fsync_flags = datasync ? 1 : 0; - args.in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - err = fuse_simple_request(fc, &args); + err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNC); if (err == -ENOSYS) { - if (isdir) - fc->no_fsyncdir = 1; - else - fc->no_fsync = 1; + fc->no_fsync = 1; err = 0; } out: inode_unlock(inode); - return err; -} -static int fuse_fsync(struct file *file, loff_t start, loff_t end, - int datasync) -{ - return fuse_fsync_common(file, start, end, datasync, 0); + return err; } void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, @@ -807,7 +809,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) put_page(page); } if (req->ff) - fuse_file_put(req->ff, false); + fuse_file_put(req->ff, false, false); } static void fuse_send_readpages(struct fuse_req *req, struct file *file) @@ -1460,7 +1462,7 @@ static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) __free_page(req->pages[i]); if (req->ff) - fuse_file_put(req->ff, false); + fuse_file_put(req->ff, false, false); } static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) @@ -1619,7 +1621,7 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) ff = __fuse_write_file_get(fc, fi); err = fuse_flush_times(inode, ff); if (ff) - fuse_file_put(ff, 0); + fuse_file_put(ff, false, false); return err; } @@ -1940,7 +1942,7 @@ static int fuse_writepages(struct address_space *mapping, err = 0; } if (data.ff) - fuse_file_put(data.ff, false); + fuse_file_put(data.ff, false, false); kfree(data.orig_pages); out: @@ -2924,10 +2926,12 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } if (io->async) { + bool blocking = io->blocking; + fuse_aio_complete(io, ret < 0 ? ret : 0, -1); /* we have a non-extending, async request, so return */ - if (!io->blocking) + if (!blocking) return -EIOCBQUEUED; wait_for_completion(&wait); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index e9f712e81c7d..2f2c92e6f8cb 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -822,13 +822,13 @@ void fuse_sync_release(struct fuse_file *ff, int flags); /** * Send RELEASE or RELEASEDIR request */ -void fuse_release_common(struct file *file, int opcode); +void fuse_release_common(struct file *file, bool isdir); /** * Send FSYNC or FSYNCDIR request */ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, - int datasync, int isdir); + int datasync, int opcode); /** * Notify poll wakeup diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 0b94b23b02d4..568abed20eb2 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -115,7 +115,7 @@ static void fuse_i_callback(struct rcu_head *head) static void fuse_destroy_inode(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); - if (S_ISREG(inode->i_mode)) { + if (S_ISREG(inode->i_mode) && !is_bad_inode(inode)) { WARN_ON(!list_empty(&fi->write_files)); WARN_ON(!list_empty(&fi->queued_writes)); } @@ -1068,6 +1068,7 @@ void fuse_dev_free(struct fuse_dev *fud) fuse_conn_put(fc); } + kfree(fud->pq.processing); kfree(fud); } EXPORT_SYMBOL_GPL(fuse_dev_free); diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 8afbb35559b9..05dd78f4b2b3 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -820,10 +820,10 @@ out: * @page: the page that's being released * @gfp_mask: passed from Linux VFS, ignored by us * - * Call try_to_free_buffers() if the buffers in this page can be - * released. + * Calls try_to_free_buffers() to free the buffers and put the page if the + * buffers can be released. * - * Returns: 0 + * Returns: 1 if the page was put or else 0 */ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) @@ -930,14 +930,14 @@ static const struct address_space_operations gfs2_jdata_aops = { void gfs2_set_aops(struct inode *inode) { struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); - if (gfs2_is_writeback(ip)) + if (gfs2_is_jdata(ip)) + inode->i_mapping->a_ops = &gfs2_jdata_aops; + else if (gfs2_is_writeback(sdp)) inode->i_mapping->a_ops = &gfs2_writeback_aops; - else if (gfs2_is_ordered(ip)) + else if (gfs2_is_ordered(sdp)) inode->i_mapping->a_ops = &gfs2_ordered_aops; - else if (gfs2_is_jdata(ip)) - inode->i_mapping->a_ops = &gfs2_jdata_aops; else BUG(); } - diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index a683d9b27d76..02b2646d84b3 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -14,6 +14,7 @@ #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> #include <linux/iomap.h> +#include <linux/ktime.h> #include "gfs2.h" #include "incore.h" @@ -826,7 +827,7 @@ static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, ret = gfs2_meta_inode_buffer(ip, &dibh); if (ret) goto unlock; - iomap->private = dibh; + mp->mp_bh[0] = dibh; if (gfs2_is_stuffed(ip)) { if (flags & IOMAP_WRITE) { @@ -863,9 +864,6 @@ unstuff: len = lblock_stop - lblock + 1; iomap->length = len << inode->i_blkbits; - get_bh(dibh); - mp->mp_bh[0] = dibh; - height = ip->i_height; while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height]) height++; @@ -898,8 +896,6 @@ out: iomap->bdev = inode->i_sb->s_bdev; unlock: up_read(&ip->i_rw_mutex); - if (ret && dibh) - brelse(dibh); return ret; do_alloc: @@ -980,9 +976,9 @@ static void gfs2_iomap_journaled_page_done(struct inode *inode, loff_t pos, static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, loff_t length, unsigned flags, - struct iomap *iomap) + struct iomap *iomap, + struct metapath *mp) { - struct metapath mp = { .mp_aheight = 1, }; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); unsigned int data_blocks = 0, ind_blocks = 0, rblocks; @@ -996,9 +992,9 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, unstuff = gfs2_is_stuffed(ip) && pos + length > gfs2_max_stuffed_size(ip); - ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp); + ret = gfs2_iomap_get(inode, pos, length, flags, iomap, mp); if (ret) - goto out_release; + goto out_unlock; alloc_required = unstuff || iomap->type == IOMAP_HOLE; @@ -1013,7 +1009,7 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, ret = gfs2_quota_lock_check(ip, &ap); if (ret) - goto out_release; + goto out_unlock; ret = gfs2_inplace_reserve(ip, &ap); if (ret) @@ -1038,17 +1034,15 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, ret = gfs2_unstuff_dinode(ip, NULL); if (ret) goto out_trans_end; - release_metapath(&mp); - brelse(iomap->private); - iomap->private = NULL; + release_metapath(mp); ret = gfs2_iomap_get(inode, iomap->offset, iomap->length, - flags, iomap, &mp); + flags, iomap, mp); if (ret) goto out_trans_end; } if (iomap->type == IOMAP_HOLE) { - ret = gfs2_iomap_alloc(inode, iomap, flags, &mp); + ret = gfs2_iomap_alloc(inode, iomap, flags, mp); if (ret) { gfs2_trans_end(sdp); gfs2_inplace_release(ip); @@ -1056,7 +1050,6 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, goto out_qunlock; } } - release_metapath(&mp); if (!gfs2_is_stuffed(ip) && gfs2_is_jdata(ip)) iomap->page_done = gfs2_iomap_journaled_page_done; return 0; @@ -1069,10 +1062,7 @@ out_trans_fail: out_qunlock: if (alloc_required) gfs2_quota_unlock(ip); -out_release: - if (iomap->private) - brelse(iomap->private); - release_metapath(&mp); +out_unlock: gfs2_write_unlock(inode); return ret; } @@ -1088,10 +1078,10 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, trace_gfs2_iomap_start(ip, pos, length, flags); if ((flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT)) { - ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap); + ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp); } else { ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp); - release_metapath(&mp); + /* * Silently fall back to buffered I/O for stuffed files or if * we've hot a hole (see gfs2_file_direct_write). @@ -1100,6 +1090,11 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, iomap->type != IOMAP_MAPPED) ret = -ENOTBLK; } + if (!ret) { + get_bh(mp.mp_bh[0]); + iomap->private = mp.mp_bh[0]; + } + release_metapath(&mp); trace_gfs2_iomap_end(ip, iomap, ret); return ret; } @@ -1908,10 +1903,16 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length) if (ret < 0) goto out; - /* issue read-ahead on metadata */ - if (mp.mp_aheight > 1) { - for (; ret > 1; ret--) { - metapointer_range(&mp, mp.mp_aheight - ret, + /* On the first pass, issue read-ahead on metadata. */ + if (mp.mp_aheight > 1 && strip_h == ip->i_height - 1) { + unsigned int height = mp.mp_aheight - 1; + + /* No read-ahead for data blocks. */ + if (mp.mp_aheight - 1 == strip_h) + height--; + + for (; height >= mp.mp_aheight - ret; height--) { + metapointer_range(&mp, height, start_list, start_aligned, end_list, end_aligned, &start, &end); @@ -2083,6 +2084,8 @@ static int do_grow(struct inode *inode, u64 size) } error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT + + (unstuff && + gfs2_is_jdata(ip) ? RES_JDATA : 0) + (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ? 0 : RES_QUOTA), 0); if (error) @@ -2248,7 +2251,9 @@ int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) unsigned int shift = sdp->sd_sb.sb_bsize_shift; u64 size; int rc; + ktime_t start, end; + start = ktime_get(); lblock_stop = i_size_read(jd->jd_inode) >> shift; size = (lblock_stop - lblock) << shift; jd->nr_extents = 0; @@ -2268,8 +2273,9 @@ int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) lblock += (bh.b_size >> ip->i_inode.i_blkbits); } while(size > 0); - fs_info(sdp, "journal %d mapped with %u extents\n", jd->jd_jid, - jd->nr_extents); + end = ktime_get(); + fs_info(sdp, "journal %d mapped with %u extents in %lldms\n", jd->jd_jid, + jd->nr_extents, ktime_ms_delta(end, start)); return 0; fail: diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 05431324b262..b92740edc416 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1777,7 +1777,7 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) * */ -void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) +void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl) { const struct gfs2_glock_operations *glops = gl->gl_ops; unsigned long long dtime; diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 5e12220cc0c2..8949bf28b249 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -202,7 +202,7 @@ extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, struct gfs2_holder *gh); extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); -extern void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); +extern void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl); #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0) extern __printf(2, 3) void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index c63bee9adb6a..f15b4c57c4bd 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -28,6 +28,7 @@ #include "util.h" #include "trans.h" #include "dir.h" +#include "lops.h" struct workqueue_struct *gfs2_freeze_wq; @@ -466,17 +467,25 @@ static int inode_go_lock(struct gfs2_holder *gh) * */ -static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) +static void inode_go_dump(struct seq_file *seq, struct gfs2_glock *gl) { - const struct gfs2_inode *ip = gl->gl_object; + struct gfs2_inode *ip = gl->gl_object; + struct inode *inode = &ip->i_inode; + unsigned long nrpages; + if (ip == NULL) return; - gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n", + + xa_lock_irq(&inode->i_data.i_pages); + nrpages = inode->i_data.nrpages; + xa_unlock_irq(&inode->i_data.i_pages); + + gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu p:%lu\n", (unsigned long long)ip->i_no_formal_ino, (unsigned long long)ip->i_no_addr, IF2DT(ip->i_inode.i_mode), ip->i_flags, (unsigned int)ip->i_diskflags, - (unsigned long long)i_size_read(&ip->i_inode)); + (unsigned long long)i_size_read(inode), nrpages); } /** diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 888b62cfd6d1..e10e0b0a7cd5 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -165,7 +165,6 @@ struct gfs2_bufdata { u64 bd_blkno; struct list_head bd_list; - const struct gfs2_log_operations *bd_ops; struct gfs2_trans *bd_tr; struct list_head bd_ail_st_list; @@ -244,7 +243,7 @@ struct gfs2_glock_operations { int (*go_demote_ok) (const struct gfs2_glock *gl); int (*go_lock) (struct gfs2_holder *gh); void (*go_unlock) (struct gfs2_holder *gh); - void (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); + void (*go_dump)(struct seq_file *seq, struct gfs2_glock *gl); void (*go_callback)(struct gfs2_glock *gl, bool remote); const int go_type; const unsigned long go_flags; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 648f0ca1ad57..998051c4aea7 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -744,17 +744,19 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, the gfs2 structures. */ if (default_acl) { error = __gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + if (error) + goto fail_gunlock3; posix_acl_release(default_acl); + default_acl = NULL; } if (acl) { - if (!error) - error = __gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS); + error = __gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS); + if (error) + goto fail_gunlock3; posix_acl_release(acl); + acl = NULL; } - if (error) - goto fail_gunlock3; - error = security_inode_init_security(&ip->i_inode, &dip->i_inode, name, &gfs2_initxattrs, NULL); if (error) @@ -789,10 +791,8 @@ fail_free_inode: } gfs2_rsqa_delete(ip, NULL); fail_free_acls: - if (default_acl) - posix_acl_release(default_acl); - if (acl) - posix_acl_release(acl); + posix_acl_release(default_acl); + posix_acl_release(acl); fail_gunlock: gfs2_dir_no_add(&da); gfs2_glock_dq_uninit(ghs); diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index b5b6341a4f5c..793808263c6d 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -30,16 +30,14 @@ static inline int gfs2_is_jdata(const struct gfs2_inode *ip) return ip->i_diskflags & GFS2_DIF_JDATA; } -static inline int gfs2_is_writeback(const struct gfs2_inode *ip) +static inline bool gfs2_is_ordered(const struct gfs2_sbd *sdp) { - const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - return (sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK) && !gfs2_is_jdata(ip); + return sdp->sd_args.ar_data == GFS2_DATA_ORDERED; } -static inline int gfs2_is_ordered(const struct gfs2_inode *ip) +static inline bool gfs2_is_writeback(const struct gfs2_sbd *sdp) { - const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - return (sdp->sd_args.ar_data == GFS2_DATA_ORDERED) && !gfs2_is_jdata(ip); + return sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK; } static inline int gfs2_is_dir(const struct gfs2_inode *ip) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 99dd58694ba1..5bfaf381921a 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -605,7 +605,6 @@ void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) bd->bd_blkno = bh->b_blocknr; gfs2_remove_from_ail(bd); /* drops ref on bh */ bd->bd_bh = NULL; - bd->bd_ops = &gfs2_revoke_lops; sdp->sd_log_num_revoke++; atomic_inc(&gl->gl_revokes); set_bit(GLF_LFLUSH, &gl->gl_flags); @@ -734,7 +733,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, lh->lh_crc = cpu_to_be32(crc); gfs2_log_write(sdp, page, sb->s_blocksize, 0, addr); - gfs2_log_flush_bio(sdp, REQ_OP_WRITE, op_flags); + gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE | op_flags); log_flush_wait(sdp); } @@ -811,7 +810,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) gfs2_ordered_write(sdp); lops_before_commit(sdp, tr); - gfs2_log_flush_bio(sdp, REQ_OP_WRITE, 0); + gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE); if (sdp->sd_log_head != sdp->sd_log_flush_head) { log_flush_wait(sdp); diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 20241436126d..1bc9bd444b28 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -51,12 +51,11 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp, static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) { - struct gfs2_sbd *sdp; + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - if (!gfs2_is_ordered(ip)) + if (gfs2_is_jdata(ip) || !gfs2_is_ordered(sdp)) return; - sdp = GFS2_SB(&ip->i_inode); if (!test_bit(GIF_ORDERED, &ip->i_flags)) { spin_lock(&sdp->sd_ordered_lock); if (!test_and_set_bit(GIF_ORDERED, &ip->i_flags)) diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 4c7069b8f3c1..94dcab655bc0 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -17,7 +17,9 @@ #include <linux/bio.h> #include <linux/fs.h> #include <linux/list_sort.h> +#include <linux/blkdev.h> +#include "bmap.h" #include "dir.h" #include "gfs2.h" #include "incore.h" @@ -193,7 +195,6 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec, /** * gfs2_end_log_write - end of i/o to the log * @bio: The bio - * @error: Status of i/o request * * Each bio_vec contains either data from the pagecache or data * relating to the log itself. Here we iterate over the bio_vec @@ -228,83 +229,86 @@ static void gfs2_end_log_write(struct bio *bio) } /** - * gfs2_log_flush_bio - Submit any pending log bio - * @sdp: The superblock - * @op: REQ_OP - * @op_flags: req_flag_bits + * gfs2_log_submit_bio - Submit any pending log bio + * @biop: Address of the bio pointer + * @opf: REQ_OP | op_flags * * Submit any pending part-built or full bio to the block device. If * there is no pending bio, then this is a no-op. */ -void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int op, int op_flags) +void gfs2_log_submit_bio(struct bio **biop, int opf) { - if (sdp->sd_log_bio) { + struct bio *bio = *biop; + if (bio) { + struct gfs2_sbd *sdp = bio->bi_private; atomic_inc(&sdp->sd_log_in_flight); - bio_set_op_attrs(sdp->sd_log_bio, op, op_flags); - submit_bio(sdp->sd_log_bio); - sdp->sd_log_bio = NULL; + bio->bi_opf = opf; + submit_bio(bio); + *biop = NULL; } } /** - * gfs2_log_alloc_bio - Allocate a new bio for log writing - * @sdp: The superblock - * @blkno: The next device block number we want to write to + * gfs2_log_alloc_bio - Allocate a bio + * @sdp: The super block + * @blkno: The device block number we want to write to + * @end_io: The bi_end_io callback * - * This should never be called when there is a cached bio in the - * super block. When it returns, there will be a cached bio in the - * super block which will have as many bio_vecs as the device is - * happy to handle. + * Allocate a new bio, initialize it with the given parameters and return it. * - * Returns: Newly allocated bio + * Returns: The newly allocated bio */ -static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno) +static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno, + bio_end_io_t *end_io) { struct super_block *sb = sdp->sd_vfs; - struct bio *bio; - - BUG_ON(sdp->sd_log_bio); + struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); - bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9); bio_set_dev(bio, sb->s_bdev); - bio->bi_end_io = gfs2_end_log_write; + bio->bi_end_io = end_io; bio->bi_private = sdp; - sdp->sd_log_bio = bio; - return bio; } /** * gfs2_log_get_bio - Get cached log bio, or allocate a new one - * @sdp: The superblock + * @sdp: The super block * @blkno: The device block number we want to write to + * @bio: The bio to get or allocate + * @op: REQ_OP + * @end_io: The bi_end_io callback + * @flush: Always flush the current bio and allocate a new one? * * If there is a cached bio, then if the next block number is sequential * with the previous one, return it, otherwise flush the bio to the - * device. If there is not a cached bio, or we just flushed it, then + * device. If there is no cached bio, or we just flushed it, then * allocate a new one. * * Returns: The bio to use for log writes */ -static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno) +static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno, + struct bio **biop, int op, + bio_end_io_t *end_io, bool flush) { - struct bio *bio = sdp->sd_log_bio; - u64 nblk; + struct bio *bio = *biop; if (bio) { + u64 nblk; + nblk = bio_end_sector(bio); nblk >>= sdp->sd_fsb2bb_shift; - if (blkno == nblk) + if (blkno == nblk && !flush) return bio; - gfs2_log_flush_bio(sdp, REQ_OP_WRITE, 0); + gfs2_log_submit_bio(biop, op); } - return gfs2_log_alloc_bio(sdp, blkno); + *biop = gfs2_log_alloc_bio(sdp, blkno, end_io); + return *biop; } /** @@ -326,11 +330,12 @@ void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, struct bio *bio; int ret; - bio = gfs2_log_get_bio(sdp, blkno); + bio = gfs2_log_get_bio(sdp, blkno, &sdp->sd_log_bio, REQ_OP_WRITE, + gfs2_end_log_write, false); ret = bio_add_page(bio, page, size, offset); if (ret == 0) { - gfs2_log_flush_bio(sdp, REQ_OP_WRITE, 0); - bio = gfs2_log_alloc_bio(sdp, blkno); + bio = gfs2_log_get_bio(sdp, blkno, &sdp->sd_log_bio, + REQ_OP_WRITE, gfs2_end_log_write, true); ret = bio_add_page(bio, page, size, offset); WARN_ON(ret == 0); } @@ -370,6 +375,184 @@ void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page) gfs2_log_bmap(sdp)); } +/** + * gfs2_end_log_read - end I/O callback for reads from the log + * @bio: The bio + * + * Simply unlock the pages in the bio. The main thread will wait on them and + * process them in order as necessary. + */ + +static void gfs2_end_log_read(struct bio *bio) +{ + struct page *page; + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) { + page = bvec->bv_page; + if (bio->bi_status) { + int err = blk_status_to_errno(bio->bi_status); + + SetPageError(page); + mapping_set_error(page->mapping, err); + } + unlock_page(page); + } + + bio_put(bio); +} + +/** + * gfs2_jhead_pg_srch - Look for the journal head in a given page. + * @jd: The journal descriptor + * @page: The page to look in + * + * Returns: 1 if found, 0 otherwise. + */ + +static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, + struct page *page) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_log_header_host uninitialized_var(lh); + void *kaddr = kmap_atomic(page); + unsigned int offset; + bool ret = false; + + for (offset = 0; offset < PAGE_SIZE; offset += sdp->sd_sb.sb_bsize) { + if (!__get_log_header(sdp, kaddr + offset, 0, &lh)) { + if (lh.lh_sequence > head->lh_sequence) + *head = lh; + else { + ret = true; + break; + } + } + } + kunmap_atomic(kaddr); + return ret; +} + +/** + * gfs2_jhead_process_page - Search/cleanup a page + * @jd: The journal descriptor + * @index: Index of the page to look into + * @done: If set, perform only cleanup, else search and set if found. + * + * Find the page with 'index' in the journal's mapping. Search the page for + * the journal head if requested (cleanup == false). Release refs on the + * page so the page cache can reclaim it (put_page() twice). We grabbed a + * reference on this page two times, first when we did a find_or_create_page() + * to obtain the page to add it to the bio and second when we do a + * find_get_page() here to get the page to wait on while I/O on it is being + * completed. + * This function is also used to free up a page we might've grabbed but not + * used. Maybe we added it to a bio, but not submitted it for I/O. Or we + * submitted the I/O, but we already found the jhead so we only need to drop + * our references to the page. + */ + +static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index, + struct gfs2_log_header_host *head, + bool *done) +{ + struct page *page; + + page = find_get_page(jd->jd_inode->i_mapping, index); + wait_on_page_locked(page); + + if (PageError(page)) + *done = true; + + if (!*done) + *done = gfs2_jhead_pg_srch(jd, head, page); + + put_page(page); /* Once for find_get_page */ + put_page(page); /* Once more for find_or_create_page */ +} + +/** + * gfs2_find_jhead - find the head of a log + * @jd: The journal descriptor + * @head: The log descriptor for the head of the log is returned here + * + * Do a search of a journal by reading it in large chunks using bios and find + * the valid log entry with the highest sequence number. (i.e. the log head) + * + * Returns: 0 on success, errno otherwise + */ + +int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct address_space *mapping = jd->jd_inode->i_mapping; + struct gfs2_journal_extent *je; + u32 block, read_idx = 0, submit_idx = 0, index = 0; + int shift = PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift; + int blocks_per_page = 1 << shift, sz, ret = 0; + struct bio *bio = NULL; + struct page *page; + bool done = false; + errseq_t since; + + memset(head, 0, sizeof(*head)); + if (list_empty(&jd->extent_list)) + gfs2_map_journal_extents(sdp, jd); + + since = filemap_sample_wb_err(mapping); + list_for_each_entry(je, &jd->extent_list, list) { + for (block = 0; block < je->blocks; block += blocks_per_page) { + index = (je->lblock + block) >> shift; + + page = find_or_create_page(mapping, index, GFP_NOFS); + if (!page) { + ret = -ENOMEM; + done = true; + goto out; + } + + if (bio) { + sz = bio_add_page(bio, page, PAGE_SIZE, 0); + if (sz == PAGE_SIZE) + goto page_added; + submit_idx = index; + submit_bio(bio); + bio = NULL; + } + + bio = gfs2_log_alloc_bio(sdp, + je->dblock + (index << shift), + gfs2_end_log_read); + bio->bi_opf = REQ_OP_READ; + sz = bio_add_page(bio, page, PAGE_SIZE, 0); + gfs2_assert_warn(sdp, sz == PAGE_SIZE); + +page_added: + if (submit_idx <= read_idx + BIO_MAX_PAGES) { + /* Keep at least one bio in flight */ + continue; + } + + gfs2_jhead_process_page(jd, read_idx++, head, &done); + if (done) + goto out; /* found */ + } + } + +out: + if (bio) + submit_bio(bio); + while (read_idx <= index) + gfs2_jhead_process_page(jd, read_idx++, head, &done); + + if (!ret) + ret = filemap_check_wb_err(mapping, since); + + return ret; +} + static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type, u32 ld_length, u32 ld_data1) { diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index e4949394f054..331160fc568b 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -30,8 +30,10 @@ extern u64 gfs2_log_bmap(struct gfs2_sbd *sdp); extern void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, unsigned size, unsigned offset, u64 blkno); extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page); -extern void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int op, int op_flags); +extern void gfs2_log_submit_bio(struct bio **biop, int opf); extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); +extern int gfs2_find_jhead(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head); static inline unsigned int buf_limit(struct gfs2_sbd *sdp) { diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index b041cb8ae383..1179763f6370 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -41,6 +41,7 @@ #include "dir.h" #include "meta_io.h" #include "trace_gfs2.h" +#include "lops.h" #define DO 0 #define UNDO 1 diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 0f501f938d1c..7389e445a7a7 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -120,6 +120,35 @@ void gfs2_revoke_clean(struct gfs2_jdesc *jd) } } +int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh, + unsigned int blkno, struct gfs2_log_header_host *head) +{ + u32 hash, crc; + + if (lh->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) || + lh->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH) || + (blkno && be32_to_cpu(lh->lh_blkno) != blkno)) + return 1; + + hash = crc32(~0, lh, LH_V1_SIZE - 4); + hash = ~crc32_le_shift(hash, 4); /* assume lh_hash is zero */ + + if (be32_to_cpu(lh->lh_hash) != hash) + return 1; + + crc = crc32c(~0, (void *)lh + LH_V1_SIZE + 4, + sdp->sd_sb.sb_bsize - LH_V1_SIZE - 4); + + if ((lh->lh_crc != 0 && be32_to_cpu(lh->lh_crc) != crc)) + return 1; + + head->lh_sequence = be64_to_cpu(lh->lh_sequence); + head->lh_flags = be32_to_cpu(lh->lh_flags); + head->lh_tail = be32_to_cpu(lh->lh_tail); + head->lh_blkno = be32_to_cpu(lh->lh_blkno); + + return 0; +} /** * get_log_header - read the log header for a given segment * @jd: the journal @@ -137,159 +166,18 @@ void gfs2_revoke_clean(struct gfs2_jdesc *jd) static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk, struct gfs2_log_header_host *head) { - struct gfs2_log_header *lh; + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct buffer_head *bh; - u32 hash, crc; int error; error = gfs2_replay_read_block(jd, blk, &bh); if (error) return error; - lh = (void *)bh->b_data; - - hash = crc32(~0, lh, LH_V1_SIZE - 4); - hash = ~crc32_le_shift(hash, 4); /* assume lh_hash is zero */ - - crc = crc32c(~0, (void *)lh + LH_V1_SIZE + 4, - bh->b_size - LH_V1_SIZE - 4); - - error = lh->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) || - lh->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH) || - be32_to_cpu(lh->lh_blkno) != blk || - be32_to_cpu(lh->lh_hash) != hash || - (lh->lh_crc != 0 && be32_to_cpu(lh->lh_crc) != crc); + error = __get_log_header(sdp, (const struct gfs2_log_header *)bh->b_data, + blk, head); brelse(bh); - if (!error) { - head->lh_sequence = be64_to_cpu(lh->lh_sequence); - head->lh_flags = be32_to_cpu(lh->lh_flags); - head->lh_tail = be32_to_cpu(lh->lh_tail); - head->lh_blkno = be32_to_cpu(lh->lh_blkno); - } - return error; -} - -/** - * find_good_lh - find a good log header - * @jd: the journal - * @blk: the segment to start searching from - * @lh: the log header to fill in - * @forward: if true search forward in the log, else search backward - * - * Call get_log_header() to get a log header for a segment, but if the - * segment is bad, either scan forward or backward until we find a good one. - * - * Returns: errno - */ - -static int find_good_lh(struct gfs2_jdesc *jd, unsigned int *blk, - struct gfs2_log_header_host *head) -{ - unsigned int orig_blk = *blk; - int error; - - for (;;) { - error = get_log_header(jd, *blk, head); - if (error <= 0) - return error; - - if (++*blk == jd->jd_blocks) - *blk = 0; - - if (*blk == orig_blk) { - gfs2_consist_inode(GFS2_I(jd->jd_inode)); - return -EIO; - } - } -} - -/** - * jhead_scan - make sure we've found the head of the log - * @jd: the journal - * @head: this is filled in with the log descriptor of the head - * - * At this point, seg and lh should be either the head of the log or just - * before. Scan forward until we find the head. - * - * Returns: errno - */ - -static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head) -{ - unsigned int blk = head->lh_blkno; - struct gfs2_log_header_host lh; - int error; - - for (;;) { - if (++blk == jd->jd_blocks) - blk = 0; - - error = get_log_header(jd, blk, &lh); - if (error < 0) - return error; - if (error == 1) - continue; - - if (lh.lh_sequence == head->lh_sequence) { - gfs2_consist_inode(GFS2_I(jd->jd_inode)); - return -EIO; - } - if (lh.lh_sequence < head->lh_sequence) - break; - - *head = lh; - } - - return 0; -} - -/** - * gfs2_find_jhead - find the head of a log - * @jd: the journal - * @head: the log descriptor for the head of the log is returned here - * - * Do a binary search of a journal and find the valid log entry with the - * highest sequence number. (i.e. the log head) - * - * Returns: errno - */ - -int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head) -{ - struct gfs2_log_header_host lh_1, lh_m; - u32 blk_1, blk_2, blk_m; - int error; - - blk_1 = 0; - blk_2 = jd->jd_blocks - 1; - - for (;;) { - blk_m = (blk_1 + blk_2) / 2; - - error = find_good_lh(jd, &blk_1, &lh_1); - if (error) - return error; - - error = find_good_lh(jd, &blk_m, &lh_m); - if (error) - return error; - - if (blk_1 == blk_m || blk_m == blk_2) - break; - - if (lh_1.lh_sequence <= lh_m.lh_sequence) - blk_1 = blk_m; - else - blk_2 = blk_m; - } - - error = jhead_scan(jd, &lh_1); - if (error) - return error; - - *head = lh_1; - return error; } @@ -460,6 +348,8 @@ void gfs2_recover_func(struct work_struct *work) if (error) goto fail_gunlock_ji; t_jhd = ktime_get(); + fs_info(sdp, "jid=%u: Journal head lookup took %lldms\n", jd->jd_jid, + ktime_ms_delta(t_jhd, t_jlck)); if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n", diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h index 11fdfab4bf99..99575ab81202 100644 --- a/fs/gfs2/recovery.h +++ b/fs/gfs2/recovery.h @@ -27,10 +27,11 @@ extern int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where) extern int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where); extern void gfs2_revoke_clean(struct gfs2_jdesc *jd); -extern int gfs2_find_jhead(struct gfs2_jdesc *jd, - struct gfs2_log_header_host *head); extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait); extern void gfs2_recover_func(struct work_struct *work); +extern int __get_log_header(struct gfs2_sbd *sdp, + const struct gfs2_log_header *lh, unsigned int blkno, + struct gfs2_log_header_host *head); #endif /* __RECOVERY_DOT_H__ */ diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index ffe3032b1043..831d7cb5a49c 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -733,6 +733,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) if (gl) { glock_clear_object(gl, rgd); + gfs2_rgrp_brelse(rgd); gfs2_glock_put(gl); } @@ -1174,7 +1175,7 @@ static u32 count_unlinked(struct gfs2_rgrpd *rgd) * @rgd: the struct gfs2_rgrpd describing the RG to read in * * Read in all of a Resource Group's header and bitmap blocks. - * Caller must eventually call gfs2_rgrp_relse() to free the bitmaps. + * Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps. * * Returns: errno */ @@ -1779,9 +1780,9 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, goto next_iter; } if (ret == -E2BIG) { + n += rbm->bii - initial_bii; rbm->bii = 0; rbm->offset = 0; - n += (rbm->bii - initial_bii); goto res_covered_end_of_rgrp; } return ret; @@ -2255,7 +2256,7 @@ static void rgblk_free(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd, * */ -void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) +void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_glock *gl) { struct gfs2_rgrpd *rgd = gl->gl_object; struct gfs2_blkreserv *trs; diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index b596c3d17988..499079a9dbbe 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -72,7 +72,7 @@ extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist); extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); -extern void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); +extern void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_glock *gl); extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, struct buffer_head *bh, const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index ca71163ff7cf..d4b11c903971 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -45,6 +45,7 @@ #include "util.h" #include "sys.h" #include "xattr.h" +#include "lops.h" #define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 423bc2d03dd8..cd9a94a6b5bb 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -124,15 +124,13 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) } static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl, - struct buffer_head *bh, - const struct gfs2_log_operations *lops) + struct buffer_head *bh) { struct gfs2_bufdata *bd; bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL); bd->bd_bh = bh; bd->bd_gl = gl; - bd->bd_ops = lops; INIT_LIST_HEAD(&bd->bd_list); bh->b_private = bd; return bd; @@ -169,7 +167,7 @@ void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh) gfs2_log_unlock(sdp); unlock_buffer(bh); if (bh->b_private == NULL) - bd = gfs2_alloc_bufdata(gl, bh, &gfs2_databuf_lops); + bd = gfs2_alloc_bufdata(gl, bh); else bd = bh->b_private; lock_buffer(bh); @@ -210,7 +208,7 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) unlock_buffer(bh); lock_page(bh->b_page); if (bh->b_private == NULL) - bd = gfs2_alloc_bufdata(gl, bh, &gfs2_buf_lops); + bd = gfs2_alloc_bufdata(gl, bh); else bd = bh->b_private; unlock_page(bh->b_page); diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 98b96ffb95ed..19017d296173 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -338,13 +338,14 @@ void hfs_bmap_free(struct hfs_bnode *node) nidx -= len * 8; i = node->next; - hfs_bnode_put(node); if (!i) { /* panic */; pr_crit("unable to free bnode %u. bmap not found!\n", node->this); + hfs_bnode_put(node); return; } + hfs_bnode_put(node); node = hfs_bnode_find(tree, i); if (IS_ERR(node)) return; diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index 236efe51eca6..66774f4cb4fd 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -466,14 +466,15 @@ void hfs_bmap_free(struct hfs_bnode *node) nidx -= len * 8; i = node->next; - hfs_bnode_put(node); if (!i) { /* panic */; pr_crit("unable to free bnode %u. " "bmap not found!\n", node->this); + hfs_bnode_put(node); return; } + hfs_bnode_put(node); node = hfs_bnode_find(tree, i); if (IS_ERR(node)) return; diff --git a/fs/inode.c b/fs/inode.c index 9e198f00b64c..35d2108d567c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -730,8 +730,11 @@ static enum lru_status inode_lru_isolate(struct list_head *item, return LRU_REMOVED; } - /* recently referenced inodes get one more pass */ - if (inode->i_state & I_REFERENCED) { + /* + * Recently referenced inodes and inodes with many attached pages + * get one more pass. + */ + if (inode->i_state & I_REFERENCED || inode->i_data.nrpages > 1) { inode->i_state &= ~I_REFERENCED; spin_unlock(&inode->i_lock); return LRU_ROTATE; diff --git a/fs/iomap.c b/fs/iomap.c index 64ce240217a1..d6bc98ae8d35 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -142,13 +142,14 @@ static void iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp) { + loff_t orig_pos = *pos; + loff_t isize = i_size_read(inode); unsigned block_bits = inode->i_blkbits; unsigned block_size = (1 << block_bits); unsigned poff = offset_in_page(*pos); unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length); unsigned first = poff >> block_bits; unsigned last = (poff + plen - 1) >> block_bits; - unsigned end = offset_in_page(i_size_read(inode)) >> block_bits; /* * If the block size is smaller than the page size we need to check the @@ -183,8 +184,12 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, * handle both halves separately so that we properly zero data in the * page cache for blocks that are entirely outside of i_size. */ - if (first <= end && last > end) - plen -= (last - end) * block_size; + if (orig_pos <= isize && orig_pos + length > isize) { + unsigned end = offset_in_page(isize - 1) >> block_bits; + + if (first <= end && last > end) + plen -= (last - end) * block_size; + } *offp = poff; *lenp = plen; @@ -1580,7 +1585,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, struct bio *bio; bool need_zeroout = false; bool use_fua = false; - int nr_pages, ret; + int nr_pages, ret = 0; size_t copied = 0; if ((pos | length | align) & ((1 << blkbits) - 1)) @@ -1596,12 +1601,13 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, if (iomap->flags & IOMAP_F_NEW) { need_zeroout = true; - } else { + } else if (iomap->type == IOMAP_MAPPED) { /* - * Use a FUA write if we need datasync semantics, this - * is a pure data IO that doesn't require any metadata - * updates and the underlying device supports FUA. This - * allows us to avoid cache flushes on IO completion. + * Use a FUA write if we need datasync semantics, this is a pure + * data IO that doesn't require any metadata updates (including + * after IO completion such as unwritten extent conversion) and + * the underlying device supports FUA. This allows us to avoid + * cache flushes on IO completion. */ if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && (dio->flags & IOMAP_DIO_WRITE_FUA) && @@ -1644,8 +1650,14 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, ret = bio_iov_iter_get_pages(bio, &iter); if (unlikely(ret)) { + /* + * We have to stop part way through an IO. We must fall + * through to the sub-block tail zeroing here, otherwise + * this short IO may expose stale data in the tail of + * the block we haven't written data to. + */ bio_put(bio); - return copied ? copied : ret; + goto zero_tail; } n = bio->bi_iter.bi_size; @@ -1676,13 +1688,21 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, dio->submit.cookie = submit_bio(bio); } while (nr_pages); - if (need_zeroout) { + /* + * We need to zeroout the tail of a sub-block write if the extent type + * requires zeroing or the write extends beyond EOF. If we don't zero + * the block tail in the latter case, we can expose stale data via mmap + * reads of the EOF block. + */ +zero_tail: + if (need_zeroout || + ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) { /* zero out from the end of the write to the end of the block */ pad = pos & (fs_block_size - 1); if (pad) iomap_dio_zero(dio, iomap, pos, fs_block_size - pad); } - return copied; + return copied ? copied : ret; } static loff_t diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 902a7dd10e5c..bb6ae387469f 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -101,7 +101,8 @@ static int jffs2_sync_fs(struct super_block *sb, int wait) struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); #ifdef CONFIG_JFFS2_FS_WRITEBUFFER - cancel_delayed_work_sync(&c->wbuf_dwork); + if (jffs2_is_writebuffered(c)) + cancel_delayed_work_sync(&c->wbuf_dwork); #endif mutex_lock(&c->alloc_sem); diff --git a/fs/namei.c b/fs/namei.c index 0cab6494978c..914178cdbe94 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3701,8 +3701,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) if (error) return error; - if ((S_ISCHR(mode) || S_ISBLK(mode)) && - !ns_capable(dentry->d_sb->s_user_ns, CAP_MKNOD)) + if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) return -EPERM; if (!dir->i_op->mknod) diff --git a/fs/namespace.c b/fs/namespace.c index 98d27da43304..a7f91265ea67 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -695,9 +695,6 @@ static struct mountpoint *lookup_mountpoint(struct dentry *dentry) hlist_for_each_entry(mp, chain, m_hash) { if (mp->m_dentry == dentry) { - /* might be worth a WARN_ON() */ - if (d_unlinked(dentry)) - return ERR_PTR(-ENOENT); mp->m_count++; return mp; } @@ -711,6 +708,9 @@ static struct mountpoint *get_mountpoint(struct dentry *dentry) int ret; if (d_mountpoint(dentry)) { + /* might be worth a WARN_ON() */ + if (d_unlinked(dentry)) + return ERR_PTR(-ENOENT); mountpoint: read_seqlock_excl(&mount_lock); mp = lookup_mountpoint(dentry); @@ -1540,8 +1540,13 @@ static int do_umount(struct mount *mnt, int flags) namespace_lock(); lock_mount_hash(); - event++; + /* Recheck MNT_LOCKED with the locks held */ + retval = -EINVAL; + if (mnt->mnt.mnt_flags & MNT_LOCKED) + goto out; + + event++; if (flags & MNT_DETACH) { if (!list_empty(&mnt->mnt_list)) umount_tree(mnt, UMOUNT_PROPAGATE); @@ -1555,6 +1560,7 @@ static int do_umount(struct mount *mnt, int flags) retval = 0; } } +out: unlock_mount_hash(); namespace_unlock(); return retval; @@ -1645,7 +1651,7 @@ int ksys_umount(char __user *name, int flags) goto dput_and_out; if (!check_mnt(mnt)) goto dput_and_out; - if (mnt->mnt.mnt_flags & MNT_LOCKED) + if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */ goto dput_and_out; retval = -EPERM; if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN)) @@ -1728,8 +1734,14 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, for (s = r; s; s = next_mnt(s, r)) { if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(s)) { - s = skip_mnt_tree(s); - continue; + if (s->mnt.mnt_flags & MNT_LOCKED) { + /* Both unbindable and locked. */ + q = ERR_PTR(-EPERM); + goto out; + } else { + s = skip_mnt_tree(s); + continue; + } } if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(s->mnt.mnt_root)) { @@ -1782,7 +1794,7 @@ void drop_collected_mounts(struct vfsmount *mnt) { namespace_lock(); lock_mount_hash(); - umount_tree(real_mount(mnt), UMOUNT_SYNC); + umount_tree(real_mount(mnt), 0); unlock_mount_hash(); namespace_unlock(); } diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index fa515d5ea5ba..315967354954 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -66,7 +66,7 @@ __be32 nfs4_callback_getattr(void *argp, void *resp, out_iput: rcu_read_unlock(); trace_nfs4_cb_getattr(cps->clp, &args->fh, inode, -ntohl(res->status)); - iput(inode); + nfs_iput_and_deactive(inode); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status)); return res->status; @@ -108,7 +108,7 @@ __be32 nfs4_callback_recall(void *argp, void *resp, } trace_nfs4_cb_recall(cps->clp, &args->fh, inode, &args->stateid, -ntohl(res)); - iput(inode); + nfs_iput_and_deactive(inode); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(res)); return res; @@ -686,20 +686,24 @@ __be32 nfs4_callback_offload(void *data, void *dummy, { struct cb_offloadargs *args = data; struct nfs_server *server; - struct nfs4_copy_state *copy; + struct nfs4_copy_state *copy, *tmp_copy; bool found = false; + copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS); + if (!copy) + return htonl(NFS4ERR_SERVERFAULT); + spin_lock(&cps->clp->cl_lock); rcu_read_lock(); list_for_each_entry_rcu(server, &cps->clp->cl_superblocks, client_link) { - list_for_each_entry(copy, &server->ss_copies, copies) { + list_for_each_entry(tmp_copy, &server->ss_copies, copies) { if (memcmp(args->coa_stateid.other, - copy->stateid.other, + tmp_copy->stateid.other, sizeof(args->coa_stateid.other))) continue; - nfs4_copy_cb_args(copy, args); - complete(©->completion); + nfs4_copy_cb_args(tmp_copy, args); + complete(&tmp_copy->completion); found = true; goto out; } @@ -707,15 +711,11 @@ __be32 nfs4_callback_offload(void *data, void *dummy, out: rcu_read_unlock(); if (!found) { - copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS); - if (!copy) { - spin_unlock(&cps->clp->cl_lock); - return htonl(NFS4ERR_SERVERFAULT); - } memcpy(©->stateid, &args->coa_stateid, NFS4_STATEID_SIZE); nfs4_copy_cb_args(copy, args); list_add_tail(©->copies, &cps->clp->pending_cb_stateids); - } + } else + kfree(copy); spin_unlock(&cps->clp->cl_lock); return 0; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 07b839560576..6ec2f78c1e19 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -850,16 +850,23 @@ nfs_delegation_find_inode_server(struct nfs_server *server, const struct nfs_fh *fhandle) { struct nfs_delegation *delegation; - struct inode *res = NULL; + struct inode *freeme, *res = NULL; list_for_each_entry_rcu(delegation, &server->delegations, super_list) { spin_lock(&delegation->lock); if (delegation->inode != NULL && nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { - res = igrab(delegation->inode); + freeme = igrab(delegation->inode); + if (freeme && nfs_sb_active(freeme->i_sb)) + res = freeme; spin_unlock(&delegation->lock); if (res != NULL) return res; + if (freeme) { + rcu_read_unlock(); + iput(freeme); + rcu_read_lock(); + } return ERR_PTR(-EAGAIN); } spin_unlock(&delegation->lock); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index aa12c3063bae..33824a0a57bf 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -98,8 +98,11 @@ struct nfs_direct_req { struct pnfs_ds_commit_info ds_cinfo; /* Storage for cinfo */ struct work_struct work; int flags; + /* for write */ #define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */ #define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */ + /* for read */ +#define NFS_ODIRECT_SHOULD_DIRTY (3) /* dirty user-space page after read */ struct nfs_writeverf verf; /* unstable write verifier */ }; @@ -412,7 +415,8 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) struct nfs_page *req = nfs_list_entry(hdr->pages.next); struct page *page = req->wb_page; - if (!PageCompound(page) && bytes < hdr->good_bytes) + if (!PageCompound(page) && bytes < hdr->good_bytes && + (dreq->flags == NFS_ODIRECT_SHOULD_DIRTY)) set_page_dirty(page); bytes += req->wb_bytes; nfs_list_remove_request(req); @@ -587,6 +591,9 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; + if (iter_is_iovec(iter)) + dreq->flags = NFS_ODIRECT_SHOULD_DIRTY; + nfs_start_io_direct(inode); NFS_I(inode)->read_io += count; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 86bcba40ca61..310d7500f665 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1361,12 +1361,7 @@ static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data) task)) return; - if (ff_layout_read_prepare_common(task, hdr)) - return; - - if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, - hdr->args.lock_context, FMODE_READ) == -EIO) - rpc_exit(task, -EIO); /* lost lock, terminate I/O */ + ff_layout_read_prepare_common(task, hdr); } static void ff_layout_read_call_done(struct rpc_task *task, void *data) @@ -1542,12 +1537,7 @@ static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data) task)) return; - if (ff_layout_write_prepare_common(task, hdr)) - return; - - if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, - hdr->args.lock_context, FMODE_WRITE) == -EIO) - rpc_exit(task, -EIO); /* lost lock, terminate I/O */ + ff_layout_write_prepare_common(task, hdr); } static void ff_layout_write_call_done(struct rpc_task *task, void *data) @@ -1742,6 +1732,11 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) fh = nfs4_ff_layout_select_ds_fh(lseg, idx); if (fh) hdr->args.fh = fh; + + if (vers == 4 && + !nfs4_ff_layout_select_ds_stateid(lseg, idx, &hdr->args.stateid)) + goto out_failed; + /* * Note that if we ever decide to split across DSes, * then we may need to handle dense-like offsets. @@ -1804,6 +1799,10 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) if (fh) hdr->args.fh = fh; + if (vers == 4 && + !nfs4_ff_layout_select_ds_stateid(lseg, idx, &hdr->args.stateid)) + goto out_failed; + /* * Note that if we ever decide to split across DSes, * then we may need to handle dense-like offsets. diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 411798346e48..de50a342d5a5 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -215,6 +215,10 @@ unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo, unsigned int maxnum); struct nfs_fh * nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx); +int +nfs4_ff_layout_select_ds_stateid(struct pnfs_layout_segment *lseg, + u32 mirror_idx, + nfs4_stateid *stateid); struct nfs4_pnfs_ds * nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 74d8d5352438..d23347389626 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -370,6 +370,25 @@ out: return fh; } +int +nfs4_ff_layout_select_ds_stateid(struct pnfs_layout_segment *lseg, + u32 mirror_idx, + nfs4_stateid *stateid) +{ + struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx); + + if (!ff_layout_mirror_valid(lseg, mirror, false)) { + pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n", + __func__, mirror_idx); + goto out; + } + + nfs4_stateid_copy(stateid, &mirror->stateid); + return 1; +out: + return 0; +} + /** * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call * @lseg: the layout segment we're operating on diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index ac5b784a1de0..fed06fd9998d 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -137,31 +137,32 @@ static int handle_async_copy(struct nfs42_copy_res *res, struct file *dst, nfs4_stateid *src_stateid) { - struct nfs4_copy_state *copy; + struct nfs4_copy_state *copy, *tmp_copy; int status = NFS4_OK; bool found_pending = false; struct nfs_open_context *ctx = nfs_file_open_context(dst); + copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS); + if (!copy) + return -ENOMEM; + spin_lock(&server->nfs_client->cl_lock); - list_for_each_entry(copy, &server->nfs_client->pending_cb_stateids, + list_for_each_entry(tmp_copy, &server->nfs_client->pending_cb_stateids, copies) { - if (memcmp(&res->write_res.stateid, ©->stateid, + if (memcmp(&res->write_res.stateid, &tmp_copy->stateid, NFS4_STATEID_SIZE)) continue; found_pending = true; - list_del(©->copies); + list_del(&tmp_copy->copies); break; } if (found_pending) { spin_unlock(&server->nfs_client->cl_lock); + kfree(copy); + copy = tmp_copy; goto out; } - copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS); - if (!copy) { - spin_unlock(&server->nfs_client->cl_lock); - return -ENOMEM; - } memcpy(©->stateid, &res->write_res.stateid, NFS4_STATEID_SIZE); init_completion(©->completion); copy->parent_state = ctx->state; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 8d59c9655ec4..1b994b527518 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -41,6 +41,8 @@ enum nfs4_client_state { NFS4CLNT_MOVED, NFS4CLNT_LEASE_MOVED, NFS4CLNT_DELEGATION_EXPIRED, + NFS4CLNT_RUN_MANAGER, + NFS4CLNT_DELEGRETURN_RUNNING, }; #define NFS4_RENEW_TIMEOUT 0x01 diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 62ae0fd345ad..d8decf2ec48f 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1210,6 +1210,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) struct task_struct *task; char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1]; + set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) return; __module_get(THIS_MODULE); @@ -2503,6 +2504,7 @@ static void nfs4_state_manager(struct nfs_client *clp) /* Ensure exclusive access to NFSv4 state */ do { + clear_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) { section = "purge state"; status = nfs4_purge_lease(clp); @@ -2593,19 +2595,24 @@ static void nfs4_state_manager(struct nfs_client *clp) } nfs4_end_drain_session(clp); - if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) { - nfs_client_return_marked_delegations(clp); - continue; + nfs4_clear_state_manager_bit(clp); + + if (!test_and_set_bit(NFS4CLNT_DELEGRETURN_RUNNING, &clp->cl_state)) { + if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) { + nfs_client_return_marked_delegations(clp); + set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); + } + clear_bit(NFS4CLNT_DELEGRETURN_RUNNING, &clp->cl_state); } - nfs4_clear_state_manager_bit(clp); /* Did we race with an attempt to give us more work? */ - if (clp->cl_state == 0) - break; + if (!test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state)) + return; if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) - break; - } while (refcount_read(&clp->cl_count) > 1); - return; + return; + } while (refcount_read(&clp->cl_count) > 1 && !signalled()); + goto out_drain; + out_error: if (strlen(section)) section_sep = ": "; @@ -2613,6 +2620,7 @@ out_error: " with error %d\n", section_sep, section, clp->cl_hostname, -status); ssleep(1); +out_drain: nfs4_end_drain_session(clp); nfs4_clear_state_manager_bit(clp); } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index edff074d38c7..d505990dac7c 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1038,6 +1038,9 @@ nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { __be32 status; + if (!cstate->save_fh.fh_dentry) + return nfserr_nofilehandle; + status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh, src_stateid, RD_STATE, src, NULL); if (status) { diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index de99db518571..f2129a5d9f23 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -266,9 +266,7 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc, return; if (nbh == NULL) { /* blocksize == pagesize */ - xa_lock_irq(&btnc->i_pages); - __xa_erase(&btnc->i_pages, newkey); - xa_unlock_irq(&btnc->i_pages); + xa_erase_irq(&btnc->i_pages, newkey); unlock_page(ctxt->bh->b_page); } else brelse(nbh); diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 5769cf3ff035..e08a6647267b 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -115,12 +115,12 @@ static bool fanotify_should_send_event(struct fsnotify_iter_info *iter_info, continue; mark = iter_info->marks[type]; /* - * if the event is for a child and this inode doesn't care about - * events on the child, don't send it! + * If the event is for a child and this mark doesn't care about + * events on a child, don't send it! */ - if (type == FSNOTIFY_OBJ_TYPE_INODE && - (event_mask & FS_EVENT_ON_CHILD) && - !(mark->mask & FS_EVENT_ON_CHILD)) + if (event_mask & FS_EVENT_ON_CHILD && + (type != FSNOTIFY_OBJ_TYPE_INODE || + !(mark->mask & FS_EVENT_ON_CHILD))) continue; marks_mask |= mark->mask; diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 2172ba516c61..d2c34900ae05 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -167,9 +167,9 @@ int __fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask parent = dget_parent(dentry); p_inode = parent->d_inode; - if (unlikely(!fsnotify_inode_watches_children(p_inode))) + if (unlikely(!fsnotify_inode_watches_children(p_inode))) { __fsnotify_update_child_dentry_flags(p_inode); - else if (p_inode->i_fsnotify_mask & mask) { + } else if (p_inode->i_fsnotify_mask & mask & ALL_FSNOTIFY_EVENTS) { struct name_snapshot name; /* we are notifying a parent so come up with the new mask which @@ -339,6 +339,9 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, sb = mnt->mnt.mnt_sb; mnt_or_sb_mask = mnt->mnt_fsnotify_mask | sb->s_fsnotify_mask; } + /* An event "on child" is not intended for a mount/sb mark */ + if (mask & FS_EVENT_ON_CHILD) + mnt_or_sb_mask = 0; /* * Optimization: srcu_read_lock() has a memory barrier which can diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index da578ad4c08f..eb1ce30412dc 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2411,8 +2411,16 @@ static int ocfs2_dio_end_io(struct kiocb *iocb, /* this io's submitter should not have unlocked this before we could */ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); - if (bytes > 0 && private) - ret = ocfs2_dio_end_io_write(inode, private, offset, bytes); + if (bytes <= 0) + mlog_ratelimited(ML_ERROR, "Direct IO failed, bytes = %lld", + (long long)bytes); + if (private) { + if (bytes > 0) + ret = ocfs2_dio_end_io_write(inode, private, offset, + bytes); + else + ocfs2_dio_free_write_ctx(inode, private); + } ocfs2_iocb_clear_rw_locked(iocb); diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 308ea0eb35fd..a396096a5099 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -178,6 +178,15 @@ do { \ ##__VA_ARGS__); \ } while (0) +#define mlog_ratelimited(mask, fmt, ...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + if (__ratelimit(&_rs)) \ + mlog(mask, fmt, ##__VA_ARGS__); \ +} while (0) + #define mlog_errno(st) ({ \ int _st = (st); \ if (_st != -ERESTARTSYS && _st != -EINTR && \ diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 9f88188060db..4bf8d5854b27 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -125,10 +125,10 @@ check_err: check_gen: if (handle->ih_generation != inode->i_generation) { - iput(inode); trace_ocfs2_get_dentry_generation((unsigned long long)blkno, handle->ih_generation, inode->i_generation); + iput(inode); result = ERR_PTR(-ESTALE); goto bail; } diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 3f1685d7d43b..1565dd8e8856 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -157,18 +157,14 @@ out: } /* - * lock allocators, and reserving appropriate number of bits for - * meta blocks and data clusters. - * - * in some cases, we don't need to reserve clusters, just let data_ac - * be NULL. + * lock allocator, and reserve appropriate number of bits for + * meta blocks. */ -static int ocfs2_lock_allocators_move_extents(struct inode *inode, +static int ocfs2_lock_meta_allocator_move_extents(struct inode *inode, struct ocfs2_extent_tree *et, u32 clusters_to_move, u32 extents_to_split, struct ocfs2_alloc_context **meta_ac, - struct ocfs2_alloc_context **data_ac, int extra_blocks, int *credits) { @@ -193,13 +189,6 @@ static int ocfs2_lock_allocators_move_extents(struct inode *inode, goto out; } - if (data_ac) { - ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac); - if (ret) { - mlog_errno(ret); - goto out; - } - } *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el); @@ -259,10 +248,10 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, } } - ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1, - &context->meta_ac, - &context->data_ac, - extra_blocks, &credits); + ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et, + *len, 1, + &context->meta_ac, + extra_blocks, &credits); if (ret) { mlog_errno(ret); goto out; @@ -285,6 +274,21 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, } } + /* + * Make sure ocfs2_reserve_cluster is called after + * __ocfs2_flush_truncate_log, otherwise, dead lock may happen. + * + * If ocfs2_reserve_cluster is called + * before __ocfs2_flush_truncate_log, dead lock on global bitmap + * may happen. + * + */ + ret = ocfs2_reserve_clusters(osb, *len, &context->data_ac); + if (ret) { + mlog_errno(ret); + goto out_unlock_mutex; + } + handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); @@ -617,9 +621,10 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, } } - ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1, - &context->meta_ac, - NULL, extra_blocks, &credits); + ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et, + len, 1, + &context->meta_ac, + extra_blocks, &credits); if (ret) { mlog_errno(ret); goto out; diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 607092f367ad..1b2d0d2fe2ee 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -199,10 +199,11 @@ static struct dentry *openpromfs_lookup(struct inode *dir, struct dentry *dentry child = dp->child; while (child) { - int n = strlen(child->path_component_name); + const char *node_name = kbasename(child->full_name); + int n = strlen(node_name); if (len == n && - !strncmp(child->path_component_name, name, len)) { + !strncmp(node_name, name, len)) { ent_type = op_inode_node; ent_data.node = child; ino = child->unique_id; @@ -245,7 +246,7 @@ found: set_nlink(inode, 2); break; case op_inode_prop: - if (!strcmp(dp->name, "options") && (len == 17) && + if (of_node_name_eq(dp, "options") && (len == 17) && !strncmp (name, "security-password", 17)) inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR; else @@ -293,8 +294,8 @@ static int openpromfs_readdir(struct file *file, struct dir_context *ctx) } while (child) { if (!dir_emit(ctx, - child->path_component_name, - strlen(child->path_component_name), + kbasename(child->full_name), + strlen(kbasename(child->full_name)), child->unique_id, DT_DIR)) goto out; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index c6289147c787..82c129bfe58d 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -651,6 +651,18 @@ static int ovl_symlink(struct inode *dir, struct dentry *dentry, return ovl_create_object(dentry, S_IFLNK, 0, link); } +static int ovl_set_link_redirect(struct dentry *dentry) +{ + const struct cred *old_cred; + int err; + + old_cred = ovl_override_creds(dentry->d_sb); + err = ovl_set_redirect(dentry, false); + revert_creds(old_cred); + + return err; +} + static int ovl_link(struct dentry *old, struct inode *newdir, struct dentry *new) { @@ -670,7 +682,7 @@ static int ovl_link(struct dentry *old, struct inode *newdir, goto out_drop_write; if (ovl_is_metacopy_dentry(old)) { - err = ovl_set_redirect(old, false); + err = ovl_set_link_redirect(old); if (err) goto out_drop_write; } diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 8fa37cd7818a..54e5d17d7f3e 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -754,9 +754,8 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb, goto out; } - /* Otherwise, get a connected non-upper dir or disconnected non-dir */ - if (d_is_dir(origin.dentry) && - (origin.dentry->d_flags & DCACHE_DISCONNECTED)) { + /* Find origin.dentry again with ovl_acceptable() layer check */ + if (d_is_dir(origin.dentry)) { dput(origin.dentry); origin.dentry = NULL; err = ovl_check_origin_fh(ofs, fh, true, NULL, &stack); @@ -769,6 +768,7 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb, goto out_err; } + /* Get a connected non-upper dir or disconnected non-dir */ dentry = ovl_get_dentry(sb, NULL, &origin, index); out: diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 6bcc9dedc342..3b7ed5d2279c 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -286,22 +286,13 @@ int ovl_permission(struct inode *inode, int mask) if (err) return err; - /* No need to do any access on underlying for special files */ - if (special_file(realinode->i_mode)) - return 0; - - /* No need to access underlying for execute */ - mask &= ~MAY_EXEC; - if ((mask & (MAY_READ | MAY_WRITE)) == 0) - return 0; - - /* Lower files get copied up, so turn write access into read */ - if (!upperinode && mask & MAY_WRITE) { + old_cred = ovl_override_creds(inode->i_sb); + if (!upperinode && + !special_file(realinode->i_mode) && mask & MAY_WRITE) { mask &= ~(MAY_WRITE | MAY_APPEND); + /* Make sure mounter can read file for copy up later */ mask |= MAY_READ; } - - old_cred = ovl_override_creds(inode->i_sb); err = inode_permission(realinode, mask); revert_creds(old_cred); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 89921a0d2ebb..4d598a399bbf 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -464,7 +464,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, inode = new_inode(sb); if (!inode) - goto out; + return ERR_PTR(-ENOMEM); inode->i_ino = get_next_ino(); @@ -474,8 +474,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, if (unlikely(head->unregistering)) { spin_unlock(&sysctl_lock); iput(inode); - inode = NULL; - goto out; + return ERR_PTR(-ENOENT); } ei->sysctl = head; ei->sysctl_entry = table; @@ -500,7 +499,6 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, if (root->set_ownership) root->set_ownership(head, table, &inode->i_uid, &inode->i_gid); -out: return inode; } @@ -549,10 +547,11 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, goto out; } - err = ERR_PTR(-ENOMEM); inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); - if (!inode) + if (IS_ERR(inode)) { + err = ERR_CAST(inode); goto out; + } d_set_d_op(dentry, &proc_sys_dentry_operations); err = d_splice_alias(inode, dentry); @@ -685,7 +684,7 @@ static bool proc_sys_fill_cache(struct file *file, if (d_in_lookup(child)) { struct dentry *res; inode = proc_sys_make_inode(dir->d_sb, head, table); - if (!inode) { + if (IS_ERR(inode)) { d_lookup_done(child); dput(child); return false; diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index 06aab07b6bb7..b8a0931568f8 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -148,7 +148,7 @@ void pstore_unregister_ftrace(void) mutex_lock(&pstore_ftrace_lock); if (pstore_ftrace_enabled) { unregister_ftrace_function(&pstore_ftrace_ops); - pstore_ftrace_enabled = 0; + pstore_ftrace_enabled = false; } mutex_unlock(&pstore_ftrace_lock); diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 8cf2218b46a7..c60ee46f3e39 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -335,53 +335,10 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) goto fail_alloc; private->record = record; - switch (record->type) { - case PSTORE_TYPE_DMESG: - scnprintf(name, sizeof(name), "dmesg-%s-%llu%s", - record->psi->name, record->id, - record->compressed ? ".enc.z" : ""); - break; - case PSTORE_TYPE_CONSOLE: - scnprintf(name, sizeof(name), "console-%s-%llu", - record->psi->name, record->id); - break; - case PSTORE_TYPE_FTRACE: - scnprintf(name, sizeof(name), "ftrace-%s-%llu", - record->psi->name, record->id); - break; - case PSTORE_TYPE_MCE: - scnprintf(name, sizeof(name), "mce-%s-%llu", - record->psi->name, record->id); - break; - case PSTORE_TYPE_PPC_RTAS: - scnprintf(name, sizeof(name), "rtas-%s-%llu", - record->psi->name, record->id); - break; - case PSTORE_TYPE_PPC_OF: - scnprintf(name, sizeof(name), "powerpc-ofw-%s-%llu", - record->psi->name, record->id); - break; - case PSTORE_TYPE_PPC_COMMON: - scnprintf(name, sizeof(name), "powerpc-common-%s-%llu", - record->psi->name, record->id); - break; - case PSTORE_TYPE_PMSG: - scnprintf(name, sizeof(name), "pmsg-%s-%llu", - record->psi->name, record->id); - break; - case PSTORE_TYPE_PPC_OPAL: - scnprintf(name, sizeof(name), "powerpc-opal-%s-%llu", - record->psi->name, record->id); - break; - case PSTORE_TYPE_UNKNOWN: - scnprintf(name, sizeof(name), "unknown-%s-%llu", - record->psi->name, record->id); - break; - default: - scnprintf(name, sizeof(name), "type%d-%s-%llu", - record->type, record->psi->name, record->id); - break; - } + scnprintf(name, sizeof(name), "%s-%s-%llu%s", + pstore_type_to_name(record->type), + record->psi->name, record->id, + record->compressed ? ".enc.z" : ""); dentry = d_alloc_name(root, name); if (!dentry) diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index b821054ca3ed..2d1066ed3c28 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -59,6 +59,19 @@ MODULE_PARM_DESC(update_ms, "milliseconds before pstore updates its content " "enabling this option is not safe, it may lead to further " "corruption on Oopses)"); +/* Names should be in the same order as the enum pstore_type_id */ +static const char * const pstore_type_names[] = { + "dmesg", + "mce", + "console", + "ftrace", + "rtas", + "powerpc-ofw", + "powerpc-common", + "pmsg", + "powerpc-opal", +}; + static int pstore_new_entry; static void pstore_timefunc(struct timer_list *); @@ -104,6 +117,30 @@ void pstore_set_kmsg_bytes(int bytes) /* Tag each group of saved records with a sequence number */ static int oopscount; +const char *pstore_type_to_name(enum pstore_type_id type) +{ + BUILD_BUG_ON(ARRAY_SIZE(pstore_type_names) != PSTORE_TYPE_MAX); + + if (WARN_ON_ONCE(type >= PSTORE_TYPE_MAX)) + return "unknown"; + + return pstore_type_names[type]; +} +EXPORT_SYMBOL_GPL(pstore_type_to_name); + +enum pstore_type_id pstore_name_to_type(const char *name) +{ + int i; + + for (i = 0; i < PSTORE_TYPE_MAX; i++) { + if (!strcmp(pstore_type_names[i], name)) + return i; + } + + return PSTORE_TYPE_MAX; +} +EXPORT_SYMBOL_GPL(pstore_name_to_type); + static const char *get_reason_str(enum kmsg_dump_reason reason) { switch (reason) { @@ -124,26 +161,27 @@ static const char *get_reason_str(enum kmsg_dump_reason reason) } } -bool pstore_cannot_block_path(enum kmsg_dump_reason reason) +/* + * Should pstore_dump() wait for a concurrent pstore_dump()? If + * not, the current pstore_dump() will report a failure to dump + * and return. + */ +static bool pstore_cannot_wait(enum kmsg_dump_reason reason) { - /* - * In case of NMI path, pstore shouldn't be blocked - * regardless of reason. - */ + /* In NMI path, pstore shouldn't block regardless of reason. */ if (in_nmi()) return true; switch (reason) { /* In panic case, other cpus are stopped by smp_send_stop(). */ case KMSG_DUMP_PANIC: - /* Emergency restart shouldn't be blocked by spin lock. */ + /* Emergency restart shouldn't be blocked. */ case KMSG_DUMP_EMERG: return true; default: return false; } } -EXPORT_SYMBOL_GPL(pstore_cannot_block_path); #if IS_ENABLED(CONFIG_PSTORE_DEFLATE_COMPRESS) static int zbufsize_deflate(size_t size) @@ -258,20 +296,6 @@ static int pstore_compress(const void *in, void *out, return outlen; } -static int pstore_decompress(void *in, void *out, - unsigned int inlen, unsigned int outlen) -{ - int ret; - - ret = crypto_comp_decompress(tfm, in, inlen, out, &outlen); - if (ret) { - pr_err("crypto_comp_decompress failed, ret = %d!\n", ret); - return ret; - } - - return outlen; -} - static void allocate_buf_for_compression(void) { struct crypto_comp *ctx; @@ -318,7 +342,7 @@ static void allocate_buf_for_compression(void) big_oops_buf_sz = size; big_oops_buf = buf; - pr_info("Using compression: %s\n", zbackend->name); + pr_info("Using crash dump compression: %s\n", zbackend->name); } static void free_buf_for_compression(void) @@ -368,9 +392,8 @@ void pstore_record_init(struct pstore_record *record, } /* - * callback from kmsg_dump. (s2,l2) has the most recently - * written bytes, older bytes are in (s1,l1). Save as much - * as we can from the end of the buffer. + * callback from kmsg_dump. Save as much as we can (up to kmsg_bytes) from the + * end of the buffer. */ static void pstore_dump(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason) @@ -378,23 +401,23 @@ static void pstore_dump(struct kmsg_dumper *dumper, unsigned long total = 0; const char *why; unsigned int part = 1; - unsigned long flags = 0; - int is_locked; int ret; why = get_reason_str(reason); - if (pstore_cannot_block_path(reason)) { - is_locked = spin_trylock_irqsave(&psinfo->buf_lock, flags); - if (!is_locked) { - pr_err("pstore dump routine blocked in %s path, may corrupt error record\n" - , in_nmi() ? "NMI" : why); + if (down_trylock(&psinfo->buf_lock)) { + /* Failed to acquire lock: give up if we cannot wait. */ + if (pstore_cannot_wait(reason)) { + pr_err("dump skipped in %s path: may corrupt error record\n", + in_nmi() ? "NMI" : why); + return; + } + if (down_interruptible(&psinfo->buf_lock)) { + pr_err("could not grab semaphore?!\n"); return; } - } else { - spin_lock_irqsave(&psinfo->buf_lock, flags); - is_locked = 1; } + oopscount++; while (total < kmsg_bytes) { char *dst; @@ -411,7 +434,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, record.part = part; record.buf = psinfo->buf; - if (big_oops_buf && is_locked) { + if (big_oops_buf) { dst = big_oops_buf; dst_size = big_oops_buf_sz; } else { @@ -429,7 +452,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, dst_size, &dump_size)) break; - if (big_oops_buf && is_locked) { + if (big_oops_buf) { zipped_len = pstore_compress(dst, psinfo->buf, header_size + dump_size, psinfo->bufsize); @@ -452,8 +475,8 @@ static void pstore_dump(struct kmsg_dumper *dumper, total += record.size; part++; } - if (is_locked) - spin_unlock_irqrestore(&psinfo->buf_lock, flags); + + up(&psinfo->buf_lock); } static struct kmsg_dumper pstore_dumper = { @@ -476,31 +499,14 @@ static void pstore_unregister_kmsg(void) #ifdef CONFIG_PSTORE_CONSOLE static void pstore_console_write(struct console *con, const char *s, unsigned c) { - const char *e = s + c; - - while (s < e) { - struct pstore_record record; - unsigned long flags; + struct pstore_record record; - pstore_record_init(&record, psinfo); - record.type = PSTORE_TYPE_CONSOLE; - - if (c > psinfo->bufsize) - c = psinfo->bufsize; + pstore_record_init(&record, psinfo); + record.type = PSTORE_TYPE_CONSOLE; - if (oops_in_progress) { - if (!spin_trylock_irqsave(&psinfo->buf_lock, flags)) - break; - } else { - spin_lock_irqsave(&psinfo->buf_lock, flags); - } - record.buf = (char *)s; - record.size = c; - psinfo->write(&record); - spin_unlock_irqrestore(&psinfo->buf_lock, flags); - s += c; - c = e - s; - } + record.buf = (char *)s; + record.size = c; + psinfo->write(&record); } static struct console pstore_console = { @@ -589,6 +595,7 @@ int pstore_register(struct pstore_info *psi) psi->write_user = pstore_write_user_compat; psinfo = psi; mutex_init(&psinfo->read_mutex); + sema_init(&psinfo->buf_lock, 1); spin_unlock(&pstore_lock); if (owner && !try_module_get(owner)) { @@ -656,8 +663,9 @@ EXPORT_SYMBOL_GPL(pstore_unregister); static void decompress_record(struct pstore_record *record) { + int ret; int unzipped_len; - char *decompressed; + char *unzipped, *workspace; if (!record->compressed) return; @@ -668,35 +676,42 @@ static void decompress_record(struct pstore_record *record) return; } - /* No compression method has created the common buffer. */ + /* Missing compression buffer means compression was not initialized. */ if (!big_oops_buf) { - pr_warn("no decompression buffer allocated\n"); + pr_warn("no decompression method initialized!\n"); return; } - unzipped_len = pstore_decompress(record->buf, big_oops_buf, - record->size, big_oops_buf_sz); - if (unzipped_len <= 0) { - pr_err("decompression failed: %d\n", unzipped_len); + /* Allocate enough space to hold max decompression and ECC. */ + unzipped_len = big_oops_buf_sz; + workspace = kmalloc(unzipped_len + record->ecc_notice_size, + GFP_KERNEL); + if (!workspace) return; - } - /* Build new buffer for decompressed contents. */ - decompressed = kmalloc(unzipped_len + record->ecc_notice_size, - GFP_KERNEL); - if (!decompressed) { - pr_err("decompression ran out of memory\n"); + /* After decompression "unzipped_len" is almost certainly smaller. */ + ret = crypto_comp_decompress(tfm, record->buf, record->size, + workspace, &unzipped_len); + if (ret) { + pr_err("crypto_comp_decompress failed, ret = %d!\n", ret); + kfree(workspace); return; } - memcpy(decompressed, big_oops_buf, unzipped_len); /* Append ECC notice to decompressed buffer. */ - memcpy(decompressed + unzipped_len, record->buf + record->size, + memcpy(workspace + unzipped_len, record->buf + record->size, record->ecc_notice_size); - /* Swap out compresed contents with decompressed contents. */ + /* Copy decompressed contents into an minimum-sized allocation. */ + unzipped = kmemdup(workspace, unzipped_len + record->ecc_notice_size, + GFP_KERNEL); + kfree(workspace); + if (!unzipped) + return; + + /* Swap out compressed contents with decompressed contents. */ kfree(record->buf); - record->buf = decompressed; + record->buf = unzipped; record->size = unzipped_len; record->compressed = false; } diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index ffcff6516e89..96f7d32cd184 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -124,19 +124,17 @@ static int ramoops_pstore_open(struct pstore_info *psi) } static struct persistent_ram_zone * -ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max, - u64 *id, - enum pstore_type_id *typep, enum pstore_type_id type, - bool update) +ramoops_get_next_prz(struct persistent_ram_zone *przs[], int id, + struct pstore_record *record) { struct persistent_ram_zone *prz; - int i = (*c)++; + bool update = (record->type == PSTORE_TYPE_DMESG); /* Give up if we never existed or have hit the end. */ - if (!przs || i >= max) + if (!przs) return NULL; - prz = przs[i]; + prz = przs[id]; if (!prz) return NULL; @@ -147,8 +145,8 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max, if (!persistent_ram_old_size(prz)) return NULL; - *typep = type; - *id = i; + record->type = prz->type; + record->id = id; return prz; } @@ -255,10 +253,8 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record) /* Find the next valid persistent_ram_zone for DMESG */ while (cxt->dump_read_cnt < cxt->max_dump_cnt && !prz) { - prz = ramoops_get_next_prz(cxt->dprzs, &cxt->dump_read_cnt, - cxt->max_dump_cnt, &record->id, - &record->type, - PSTORE_TYPE_DMESG, 1); + prz = ramoops_get_next_prz(cxt->dprzs, cxt->dump_read_cnt++, + record); if (!prz_ok(prz)) continue; header_length = ramoops_read_kmsg_hdr(persistent_ram_old(prz), @@ -272,22 +268,18 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record) } } - if (!prz_ok(prz)) - prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt, - 1, &record->id, &record->type, - PSTORE_TYPE_CONSOLE, 0); + if (!prz_ok(prz) && !cxt->console_read_cnt++) + prz = ramoops_get_next_prz(&cxt->cprz, 0 /* single */, record); - if (!prz_ok(prz)) - prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt, - 1, &record->id, &record->type, - PSTORE_TYPE_PMSG, 0); + if (!prz_ok(prz) && !cxt->pmsg_read_cnt++) + prz = ramoops_get_next_prz(&cxt->mprz, 0 /* single */, record); /* ftrace is last since it may want to dynamically allocate memory. */ if (!prz_ok(prz)) { - if (!(cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)) { - prz = ramoops_get_next_prz(cxt->fprzs, - &cxt->ftrace_read_cnt, 1, &record->id, - &record->type, PSTORE_TYPE_FTRACE, 0); + if (!(cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU) && + !cxt->ftrace_read_cnt++) { + prz = ramoops_get_next_prz(cxt->fprzs, 0 /* single */, + record); } else { /* * Build a new dummy record which combines all the @@ -299,15 +291,12 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record) GFP_KERNEL); if (!tmp_prz) return -ENOMEM; + prz = tmp_prz; free_prz = true; while (cxt->ftrace_read_cnt < cxt->max_ftrace_cnt) { prz_next = ramoops_get_next_prz(cxt->fprzs, - &cxt->ftrace_read_cnt, - cxt->max_ftrace_cnt, - &record->id, - &record->type, - PSTORE_TYPE_FTRACE, 0); + cxt->ftrace_read_cnt++, record); if (!prz_ok(prz_next)) continue; @@ -321,7 +310,6 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record) goto out; } record->id = 0; - prz = tmp_prz; } } @@ -611,6 +599,7 @@ static int ramoops_init_przs(const char *name, goto fail; } *paddr += zone_sz; + prz_ar[i]->type = pstore_name_to_type(name); } *przs = prz_ar; @@ -640,7 +629,7 @@ static int ramoops_init_prz(const char *name, label = kasprintf(GFP_KERNEL, "ramoops:%s", name); *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info, - cxt->memtype, 0, label); + cxt->memtype, PRZ_FLAG_ZAP_OLD, label); if (IS_ERR(*prz)) { int err = PTR_ERR(*prz); @@ -649,9 +638,8 @@ static int ramoops_init_prz(const char *name, return err; } - persistent_ram_zap(*prz); - *paddr += sz; + (*prz)->type = pstore_name_to_type(name); return 0; } @@ -787,7 +775,7 @@ static int ramoops_probe(struct platform_device *pdev) dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size - cxt->pmsg_size; - err = ramoops_init_przs("dump", dev, cxt, &cxt->dprzs, &paddr, + err = ramoops_init_przs("dmesg", dev, cxt, &cxt->dprzs, &paddr, dump_mem_sz, cxt->record_size, &cxt->max_dump_cnt, 0, 0); if (err) @@ -816,21 +804,17 @@ static int ramoops_probe(struct platform_device *pdev) cxt->pstore.data = cxt; /* - * Console can handle any buffer size, so prefer LOG_LINE_MAX. If we - * have to handle dumps, we must have at least record_size buffer. And - * for ftrace, bufsize is irrelevant (if bufsize is 0, buf will be - * ZERO_SIZE_PTR). + * Since bufsize is only used for dmesg crash dumps, it + * must match the size of the dprz record (after PRZ header + * and ECC bytes have been accounted for). */ - if (cxt->console_size) - cxt->pstore.bufsize = 1024; /* LOG_LINE_MAX */ - cxt->pstore.bufsize = max(cxt->record_size, cxt->pstore.bufsize); - cxt->pstore.buf = kmalloc(cxt->pstore.bufsize, GFP_KERNEL); + cxt->pstore.bufsize = cxt->dprzs[0]->buffer_size; + cxt->pstore.buf = kzalloc(cxt->pstore.bufsize, GFP_KERNEL); if (!cxt->pstore.buf) { - pr_err("cannot allocate pstore buffer\n"); + pr_err("cannot allocate pstore crash dump buffer\n"); err = -ENOMEM; goto fail_clear; } - spin_lock_init(&cxt->pstore.buf_lock); cxt->pstore.flags = PSTORE_FLAGS_DMESG; if (cxt->console_size) @@ -858,9 +842,9 @@ static int ramoops_probe(struct platform_device *pdev) ramoops_pmsg_size = pdata->pmsg_size; ramoops_ftrace_size = pdata->ftrace_size; - pr_info("attached 0x%lx@0x%llx, ecc: %d/%d\n", + pr_info("using 0x%lx@0x%llx, ecc: %d\n", cxt->size, (unsigned long long)cxt->phys_addr, - cxt->ecc_info.ecc_size, cxt->ecc_info.block_size); + cxt->ecc_info.ecc_size); return 0; diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 12e21f789194..c11711c2cc83 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -12,7 +12,7 @@ * */ -#define pr_fmt(fmt) "persistent_ram: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/device.h> #include <linux/err.h> @@ -29,6 +29,16 @@ #include <linux/vmalloc.h> #include <asm/page.h> +/** + * struct persistent_ram_buffer - persistent circular RAM buffer + * + * @sig: + * signature to indicate header (PERSISTENT_RAM_SIG xor PRZ-type value) + * @start: + * offset into @data where the beginning of the stored bytes begin + * @size: + * number of valid bytes stored in @data + */ struct persistent_ram_buffer { uint32_t sig; atomic_t start; @@ -443,7 +453,8 @@ static void *persistent_ram_iomap(phys_addr_t start, size_t size, void *va; if (!request_mem_region(start, size, label ?: "ramoops")) { - pr_err("request mem region (0x%llx@0x%llx) failed\n", + pr_err("request mem region (%s 0x%llx@0x%llx) failed\n", + label ?: "ramoops", (unsigned long long)size, (unsigned long long)start); return NULL; } @@ -489,32 +500,42 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, struct persistent_ram_ecc_info *ecc_info) { int ret; + bool zap = !!(prz->flags & PRZ_FLAG_ZAP_OLD); ret = persistent_ram_init_ecc(prz, ecc_info); - if (ret) + if (ret) { + pr_warn("ECC failed %s\n", prz->label); return ret; + } sig ^= PERSISTENT_RAM_SIG; if (prz->buffer->sig == sig) { + if (buffer_size(prz) == 0) { + pr_debug("found existing empty buffer\n"); + return 0; + } + if (buffer_size(prz) > prz->buffer_size || - buffer_start(prz) > buffer_size(prz)) + buffer_start(prz) > buffer_size(prz)) { pr_info("found existing invalid buffer, size %zu, start %zu\n", buffer_size(prz), buffer_start(prz)); - else { + zap = true; + } else { pr_debug("found existing buffer, size %zu, start %zu\n", buffer_size(prz), buffer_start(prz)); persistent_ram_save_old(prz); - return 0; } } else { pr_debug("no valid data in buffer (sig = 0x%08x)\n", prz->buffer->sig); + prz->buffer->sig = sig; + zap = true; } - /* Rewind missing or invalid memory area. */ - prz->buffer->sig = sig; - persistent_ram_zap(prz); + /* Reset missing, invalid, or single-use memory area. */ + if (zap) + persistent_ram_zap(prz); return 0; } @@ -572,6 +593,12 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, if (ret) goto err; + pr_debug("attached %s 0x%zx@0x%llx: %zu header, %zu data, %zu ecc (%d/%d)\n", + prz->label, prz->size, (unsigned long long)prz->paddr, + sizeof(*prz->buffer), prz->buffer_size, + prz->size - sizeof(*prz->buffer) - prz->buffer_size, + prz->ecc_info.ecc_size, prz->ecc_info.block_size); + return prz; err: persistent_ram_free(prz); diff --git a/fs/read_write.c b/fs/read_write.c index bfcb4ced5664..58f30537c47a 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1956,7 +1956,7 @@ loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, struct inode *inode_out = file_inode(file_out); loff_t ret; - WARN_ON_ONCE(remap_flags); + WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP); if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) return -EISDIR; @@ -2094,17 +2094,18 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) off = same->src_offset; len = same->src_length; - ret = -EISDIR; if (S_ISDIR(src->i_mode)) - goto out; + return -EISDIR; - ret = -EINVAL; if (!S_ISREG(src->i_mode)) - goto out; + return -EINVAL; + + if (!file->f_op->remap_file_range) + return -EOPNOTSUPP; ret = remap_verify_area(file, off, len, false); if (ret < 0) - goto out; + return ret; ret = 0; if (off + len > i_size_read(src)) @@ -2147,10 +2148,8 @@ next_fdput: fdput(dst_fd); next_loop: if (fatal_signal_pending(current)) - goto out; + break; } - -out: return ret; } EXPORT_SYMBOL(vfs_dedupe_file_range); diff --git a/fs/splice.c b/fs/splice.c index 3553f1956508..de2ede048473 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -945,11 +945,16 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, sd->flags &= ~SPLICE_F_NONBLOCK; more = sd->flags & SPLICE_F_MORE; + WARN_ON_ONCE(pipe->nrbufs != 0); + while (len) { size_t read_len; loff_t pos = sd->pos, prev_pos = pos; - ret = do_splice_to(in, &pos, pipe, len, flags); + /* Don't try to read more the pipe has space for. */ + read_len = min_t(size_t, len, + (pipe->buffers - pipe->nrbufs) << PAGE_SHIFT); + ret = do_splice_to(in, &pos, pipe, read_len, flags); if (unlikely(ret <= 0)) goto out_release; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 0a7252aecfa5..bb71db63c99c 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -334,7 +334,7 @@ int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, } EXPORT_SYMBOL_GPL(sysfs_create_file_ns); -int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr) +int sysfs_create_files(struct kobject *kobj, const struct attribute * const *ptr) { int err = 0; int i; @@ -493,7 +493,7 @@ bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr) return ret; } -void sysfs_remove_files(struct kobject *kobj, const struct attribute **ptr) +void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *ptr) { int i; for (i = 0; ptr[i]; i++) diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 499a20a5a010..273736f41be3 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -275,7 +275,7 @@ static int __sysv_write_inode(struct inode *inode, int wait) } } brelse(bh); - return 0; + return err; } int sysv_write_inode(struct inode *inode, struct writeback_control *wbc) diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index 529856fbccd0..bc1e082d921d 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig @@ -12,9 +12,10 @@ config UBIFS_FS help UBIFS is a file system for flash devices which works on top of UBI. +if UBIFS_FS + config UBIFS_FS_ADVANCED_COMPR bool "Advanced compression options" - depends on UBIFS_FS help This option allows to explicitly choose which compressions, if any, are enabled in UBIFS. Removing compressors means inability to read @@ -24,7 +25,6 @@ config UBIFS_FS_ADVANCED_COMPR config UBIFS_FS_LZO bool "LZO compression support" if UBIFS_FS_ADVANCED_COMPR - depends on UBIFS_FS default y help LZO compressor is generally faster than zlib but compresses worse. @@ -32,14 +32,12 @@ config UBIFS_FS_LZO config UBIFS_FS_ZLIB bool "ZLIB compression support" if UBIFS_FS_ADVANCED_COMPR - depends on UBIFS_FS default y help Zlib compresses better than LZO but it is slower. Say 'Y' if unsure. config UBIFS_ATIME_SUPPORT - bool "Access time support" if UBIFS_FS - depends on UBIFS_FS + bool "Access time support" default n help Originally UBIFS did not support atime, because it looked like a bad idea due @@ -54,7 +52,6 @@ config UBIFS_ATIME_SUPPORT config UBIFS_FS_XATTR bool "UBIFS XATTR support" - depends on UBIFS_FS default y help Saying Y here includes support for extended attributes (xattrs). @@ -65,7 +62,7 @@ config UBIFS_FS_XATTR config UBIFS_FS_ENCRYPTION bool "UBIFS Encryption" - depends on UBIFS_FS && UBIFS_FS_XATTR && BLOCK + depends on UBIFS_FS_XATTR && BLOCK select FS_ENCRYPTION default n help @@ -76,7 +73,7 @@ config UBIFS_FS_ENCRYPTION config UBIFS_FS_SECURITY bool "UBIFS Security Labels" - depends on UBIFS_FS && UBIFS_FS_XATTR + depends on UBIFS_FS_XATTR default y help Security labels provide an access control facility to support Linux @@ -89,6 +86,7 @@ config UBIFS_FS_SECURITY config UBIFS_FS_AUTHENTICATION bool "UBIFS authentication support" + depends on KEYS select CRYPTO_HMAC help Enable authentication support for UBIFS. This feature offers protection @@ -96,3 +94,5 @@ config UBIFS_FS_AUTHENTICATION If you say yes here you should also select a hashing algorithm such as sha256, these are not selected automatically since there are many different options. + +endif # UBIFS_FS diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c index 124e965a28b3..5bf5fd08879e 100644 --- a/fs/ubifs/auth.c +++ b/fs/ubifs/auth.c @@ -269,8 +269,7 @@ int ubifs_init_authentication(struct ubifs_info *c) goto out; } - c->hash_tfm = crypto_alloc_shash(c->auth_hash_name, 0, - CRYPTO_ALG_ASYNC); + c->hash_tfm = crypto_alloc_shash(c->auth_hash_name, 0, 0); if (IS_ERR(c->hash_tfm)) { err = PTR_ERR(c->hash_tfm); ubifs_err(c, "Can not allocate %s: %d", @@ -286,7 +285,7 @@ int ubifs_init_authentication(struct ubifs_info *c) goto out_free_hash; } - c->hmac_tfm = crypto_alloc_shash(hmac_name, 0, CRYPTO_ALG_ASYNC); + c->hmac_tfm = crypto_alloc_shash(hmac_name, 0, 0); if (IS_ERR(c->hmac_tfm)) { err = PTR_ERR(c->hmac_tfm); ubifs_err(c, "Can not allocate %s: %d", hmac_name, err); diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c index d1d5e96350dd..b0c5f06128b5 100644 --- a/fs/ubifs/lpt.c +++ b/fs/ubifs/lpt.c @@ -1675,6 +1675,12 @@ int ubifs_lpt_calc_hash(struct ubifs_info *c, u8 *hash) if (!ubifs_authenticated(c)) return 0; + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return err; + } + desc = ubifs_hash_get_desc(c); if (IS_ERR(desc)) return PTR_ERR(desc); @@ -1685,12 +1691,6 @@ int ubifs_lpt_calc_hash(struct ubifs_info *c, u8 *hash) goto out; } - if (!c->nroot) { - err = ubifs_read_nnode(c, NULL, 0); - if (err) - return err; - } - cnode = (struct ubifs_cnode *)c->nroot; while (cnode) { diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 75f961c4c044..0a0e65c07c6d 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -213,6 +213,38 @@ static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r) } /** + * inode_still_linked - check whether inode in question will be re-linked. + * @c: UBIFS file-system description object + * @rino: replay entry to test + * + * O_TMPFILE files can be re-linked, this means link count goes from 0 to 1. + * This case needs special care, otherwise all references to the inode will + * be removed upon the first replay entry of an inode with link count 0 + * is found. + */ +static bool inode_still_linked(struct ubifs_info *c, struct replay_entry *rino) +{ + struct replay_entry *r; + + ubifs_assert(c, rino->deletion); + ubifs_assert(c, key_type(c, &rino->key) == UBIFS_INO_KEY); + + /* + * Find the most recent entry for the inode behind @rino and check + * whether it is a deletion. + */ + list_for_each_entry_reverse(r, &c->replay_list, list) { + ubifs_assert(c, r->sqnum >= rino->sqnum); + if (key_inum(c, &r->key) == key_inum(c, &rino->key)) + return r->deletion == 0; + + } + + ubifs_assert(c, 0); + return false; +} + +/** * apply_replay_entry - apply a replay entry to the TNC. * @c: UBIFS file-system description object * @r: replay entry to apply @@ -239,6 +271,11 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) { ino_t inum = key_inum(c, &r->key); + if (inode_still_linked(c, r)) { + err = 0; + break; + } + err = ubifs_tnc_remove_ino(c, inum); break; } @@ -533,6 +570,28 @@ static int is_last_bud(struct ubifs_info *c, struct ubifs_bud *bud) return data == 0xFFFFFFFF; } +/* authenticate_sleb_hash and authenticate_sleb_hmac are split out for stack usage */ +static int authenticate_sleb_hash(struct ubifs_info *c, struct shash_desc *log_hash, u8 *hash) +{ + SHASH_DESC_ON_STACK(hash_desc, c->hash_tfm); + + hash_desc->tfm = c->hash_tfm; + hash_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + ubifs_shash_copy_state(c, log_hash, hash_desc); + return crypto_shash_final(hash_desc, hash); +} + +static int authenticate_sleb_hmac(struct ubifs_info *c, u8 *hash, u8 *hmac) +{ + SHASH_DESC_ON_STACK(hmac_desc, c->hmac_tfm); + + hmac_desc->tfm = c->hmac_tfm; + hmac_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + return crypto_shash_digest(hmac_desc, hash, c->hash_len, hmac); +} + /** * authenticate_sleb - authenticate one scan LEB * @c: UBIFS file-system description object @@ -574,21 +633,12 @@ static int authenticate_sleb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, if (snod->type == UBIFS_AUTH_NODE) { struct ubifs_auth_node *auth = snod->node; - SHASH_DESC_ON_STACK(hash_desc, c->hash_tfm); - SHASH_DESC_ON_STACK(hmac_desc, c->hmac_tfm); - - hash_desc->tfm = c->hash_tfm; - hash_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - ubifs_shash_copy_state(c, log_hash, hash_desc); - err = crypto_shash_final(hash_desc, hash); + err = authenticate_sleb_hash(c, log_hash, hash); if (err) goto out; - hmac_desc->tfm = c->hmac_tfm; - hmac_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - err = crypto_shash_digest(hmac_desc, hash, c->hash_len, - hmac); + err = authenticate_sleb_hmac(c, hash, hmac); if (err) goto out; diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 75a69dd26d6e..3da90c951c23 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -63,6 +63,17 @@ /* Default time granularity in nanoseconds */ #define DEFAULT_TIME_GRAN 1000000000 +static int get_default_compressor(struct ubifs_info *c) +{ + if (ubifs_compr_present(c, UBIFS_COMPR_LZO)) + return UBIFS_COMPR_LZO; + + if (ubifs_compr_present(c, UBIFS_COMPR_ZLIB)) + return UBIFS_COMPR_ZLIB; + + return UBIFS_COMPR_NONE; +} + /** * create_default_filesystem - format empty UBI volume. * @c: UBIFS file-system description object @@ -207,7 +218,7 @@ static int create_default_filesystem(struct ubifs_info *c) if (c->mount_opts.override_compr) sup->default_compr = cpu_to_le16(c->mount_opts.compr_type); else - sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO); + sup->default_compr = cpu_to_le16(get_default_compressor(c)); generate_random_uuid(sup->uuid); diff --git a/fs/udf/super.c b/fs/udf/super.c index 8f2f56d9a1bb..e3d684ea3203 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -827,16 +827,20 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) ret = udf_dstrCS0toChar(sb, outstr, 31, pvoldesc->volIdent, 32); - if (ret < 0) - goto out_bh; - - strncpy(UDF_SB(sb)->s_volume_ident, outstr, ret); + if (ret < 0) { + strcpy(UDF_SB(sb)->s_volume_ident, "InvalidName"); + pr_warn("incorrect volume identification, setting to " + "'InvalidName'\n"); + } else { + strncpy(UDF_SB(sb)->s_volume_ident, outstr, ret); + } udf_debug("volIdent[] = '%s'\n", UDF_SB(sb)->s_volume_ident); ret = udf_dstrCS0toChar(sb, outstr, 127, pvoldesc->volSetIdent, 128); - if (ret < 0) + if (ret < 0) { + ret = 0; goto out_bh; - + } outstr[ret] = 0; udf_debug("volSetIdent[] = '%s'\n", outstr); diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index 45234791fec2..5fcfa96463eb 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -351,6 +351,11 @@ try_again: return u_len; } +/* + * Convert CS0 dstring to output charset. Warning: This function may truncate + * input string if it is too long as it is used for informational strings only + * and it is better to truncate the string than to refuse mounting a media. + */ int udf_dstrCS0toChar(struct super_block *sb, uint8_t *utf_o, int o_len, const uint8_t *ocu_i, int i_len) { @@ -359,9 +364,12 @@ int udf_dstrCS0toChar(struct super_block *sb, uint8_t *utf_o, int o_len, if (i_len > 0) { s_len = ocu_i[i_len - 1]; if (s_len >= i_len) { - pr_err("incorrect dstring lengths (%d/%d)\n", - s_len, i_len); - return -EINVAL; + pr_warn("incorrect dstring lengths (%d/%d)," + " truncating\n", s_len, i_len); + s_len = i_len - 1; + /* 2-byte encoding? Need to round properly... */ + if (ocu_i[0] == 16) + s_len -= (s_len - 1) & 2; } } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 356d2b8568c1..59dc28047030 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -926,7 +926,7 @@ static inline struct userfaultfd_wait_queue *find_userfault_in( wait_queue_entry_t *wq; struct userfaultfd_wait_queue *uwq; - VM_BUG_ON(!spin_is_locked(&wqh->lock)); + lockdep_assert_held(&wqh->lock); uwq = NULL; if (!waitqueue_active(wqh)) @@ -1361,6 +1361,19 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, ret = -EINVAL; if (!vma_can_userfault(cur)) goto out_unlock; + + /* + * UFFDIO_COPY will fill file holes even without + * PROT_WRITE. This check enforces that if this is a + * MAP_SHARED, the process has write permission to the backing + * file. If VM_MAYWRITE is set it also enforces that on a + * MAP_SHARED vma: there is no F_WRITE_SEAL and no further + * F_WRITE_SEAL can be taken until the vma is destroyed. + */ + ret = -EPERM; + if (unlikely(!(cur->vm_flags & VM_MAYWRITE))) + goto out_unlock; + /* * If this vma contains ending address, and huge pages * check alignment. @@ -1406,6 +1419,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, BUG_ON(!vma_can_userfault(vma)); BUG_ON(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); + WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); /* * Nothing to do: this vma is already registered into this @@ -1560,6 +1574,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, if (!vma->vm_userfaultfd_ctx.ctx) goto skip; + WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); + if (vma->vm_start > start) start = vma->vm_start; vma_end = min(end, vma->vm_end); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 6fc5425b1474..2652d00842d6 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -243,7 +243,7 @@ xfs_attr3_leaf_verify( struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_attr_leafblock *leaf = bp->b_addr; struct xfs_attr_leaf_entry *entries; - uint16_t end; + uint32_t end; /* must be 32bit - see below */ int i; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); @@ -293,6 +293,11 @@ xfs_attr3_leaf_verify( /* * Quickly check the freemap information. Attribute data has to be * aligned to 4-byte boundaries, and likewise for the free space. + * + * Note that for 64k block size filesystems, the freemap entries cannot + * overflow as they are only be16 fields. However, when checking end + * pointer of the freemap, we have to be careful to detect overflows and + * so use uint32_t for those checks. */ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { if (ichdr.freemap[i].base > mp->m_attr_geo->blksize) @@ -303,7 +308,9 @@ xfs_attr3_leaf_verify( return __this_address; if (ichdr.freemap[i].size & 0x3) return __this_address; - end = ichdr.freemap[i].base + ichdr.freemap[i].size; + + /* be care of 16 bit overflows here */ + end = (uint32_t)ichdr.freemap[i].base + ichdr.freemap[i].size; if (end < ichdr.freemap[i].base) return __this_address; if (end > mp->m_attr_geo->blksize) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 74d7228e755b..19e921d1586f 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1694,10 +1694,13 @@ xfs_bmap_add_extent_delay_real( case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: /* * Filling in all of a previously delayed allocation extent. - * The right neighbor is contiguous, the left is not. + * The right neighbor is contiguous, the left is not. Take care + * with delay -> unwritten extent allocation here because the + * delalloc record we are overwriting is always written. */ PREV.br_startblock = new->br_startblock; PREV.br_blockcount += RIGHT.br_blockcount; + PREV.br_state = new->br_state; xfs_iext_next(ifp, &bma->icur); xfs_iext_remove(bma->ip, &bma->icur, state); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 34c6d7bd4d18..bbdae2b4559f 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -330,7 +330,7 @@ xfs_btree_sblock_verify_crc( if (xfs_sb_version_hascrc(&mp->m_sb)) { if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn))) - return __this_address; + return false; return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); } diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 86c50208a143..7fbf8af0b159 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -538,15 +538,18 @@ xfs_inobt_rec_check_count( static xfs_extlen_t xfs_inobt_max_size( - struct xfs_mount *mp) + struct xfs_mount *mp, + xfs_agnumber_t agno) { + xfs_agblock_t agblocks = xfs_ag_block_count(mp, agno); + /* Bail out if we're uninitialized, which can happen in mkfs. */ if (mp->m_inobt_mxr[0] == 0) return 0; return xfs_btree_calc_size(mp->m_inobt_mnr, - (uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock / - XFS_INODES_PER_CHUNK); + (uint64_t)agblocks * mp->m_sb.sb_inopblock / + XFS_INODES_PER_CHUNK); } static int @@ -594,7 +597,7 @@ xfs_finobt_calc_reserves( if (error) return error; - *ask += xfs_inobt_max_size(mp); + *ask += xfs_inobt_max_size(mp, agno); *used += tree_len; return 0; } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 5d263dfdb3bc..1ee8c5539fa4 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1042,7 +1042,7 @@ out_trans_cancel: goto out_unlock; } -static int +int xfs_flush_unmap_range( struct xfs_inode *ip, xfs_off_t offset, @@ -1126,9 +1126,9 @@ xfs_free_file_space( * page could be mmap'd and iomap_zero_range doesn't do that for us. * Writeback of the eof page will do this, albeit clumsily. */ - if (offset + len >= XFS_ISIZE(ip) && ((offset + len) & PAGE_MASK)) { + if (offset + len >= XFS_ISIZE(ip) && offset_in_page(offset + len) > 0) { error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - (offset + len) & ~PAGE_MASK, LLONG_MAX); + round_down(offset + len, PAGE_SIZE), LLONG_MAX); } return error; @@ -1195,13 +1195,7 @@ xfs_prepare_shift( * Writeback and invalidate cache for the remainder of the file as we're * about to shift down every extent from offset to EOF. */ - error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, offset, -1); - if (error) - return error; - error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, - offset >> PAGE_SHIFT, -1); - if (error) - return error; + error = xfs_flush_unmap_range(ip, offset, XFS_ISIZE(ip)); /* * Clean out anything hanging around in the cow fork now that diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 87363d136bb6..7a78229cf1a7 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -80,4 +80,7 @@ int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, xfs_extnum_t *nextents, xfs_filblks_t *count); +int xfs_flush_unmap_range(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t len); + #endif /* __XFS_BMAP_UTIL_H__ */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 12d8455bfbb2..010db5f8fb00 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1233,9 +1233,23 @@ xfs_buf_iodone( } /* - * Requeue a failed buffer for writeback + * Requeue a failed buffer for writeback. * - * Return true if the buffer has been re-queued properly, false otherwise + * We clear the log item failed state here as well, but we have to be careful + * about reference counts because the only active reference counts on the buffer + * may be the failed log items. Hence if we clear the log item failed state + * before queuing the buffer for IO we can release all active references to + * the buffer and free it, leading to use after free problems in + * xfs_buf_delwri_queue. It makes no difference to the buffer or log items which + * order we process them in - the buffer is locked, and we own the buffer list + * so nothing on them is going to change while we are performing this action. + * + * Hence we can safely queue the buffer for IO before we clear the failed log + * item state, therefore always having an active reference to the buffer and + * avoiding the transient zero-reference state that leads to use-after-free. + * + * Return true if the buffer was added to the buffer list, false if it was + * already on the buffer list. */ bool xfs_buf_resubmit_failed_buffers( @@ -1243,16 +1257,16 @@ xfs_buf_resubmit_failed_buffers( struct list_head *buffer_list) { struct xfs_log_item *lip; + bool ret; + + ret = xfs_buf_delwri_queue(bp, buffer_list); /* - * Clear XFS_LI_FAILED flag from all items before resubmit - * - * XFS_LI_FAILED set/clear is protected by ail_lock, caller this + * XFS_LI_FAILED set/clear is protected by ail_lock, caller of this * function already have it acquired */ list_for_each_entry(lip, &bp->b_li_list, li_bio_list) xfs_clear_li_failed(lip); - /* Add this buffer back to the delayed write list */ - return xfs_buf_delwri_queue(bp, buffer_list); + return ret; } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 53c9ab8fb777..e47425071e65 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -920,7 +920,7 @@ out_unlock: } -loff_t +STATIC loff_t xfs_file_remap_range( struct file *file_in, loff_t pos_in, diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 6e2c08f30f60..6ecdbb3af7de 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1608,7 +1608,7 @@ xfs_ioc_getbmap( error = 0; out_free_buf: kmem_free(buf); - return 0; + return error; } struct getfsmap_info { diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index 576c375ce12a..6b736ea58d35 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -107,5 +107,5 @@ assfail(char *expr, char *file, int line) void xfs_hex_dump(void *p, int length) { - print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1); + print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_OFFSET, 16, 1, p, length, 1); } diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 73a1d77ec187..3091e4bc04ef 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -40,7 +40,7 @@ xfs_fill_statvfs_from_dquot( statp->f_files = limit; statp->f_ffree = (statp->f_files > dqp->q_res_icount) ? - (statp->f_ffree - dqp->q_res_icount) : 0; + (statp->f_files - dqp->q_res_icount) : 0; } } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index ecdb086bc23e..322a852ce284 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -296,6 +296,7 @@ xfs_reflink_reserve_cow( if (error) return error; + xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); trace_xfs_reflink_cow_alloc(ip, &got); return 0; } @@ -1351,10 +1352,19 @@ xfs_reflink_remap_prep( if (ret) goto out_unlock; - /* Zap any page cache for the destination file's range. */ - truncate_inode_pages_range(&inode_out->i_data, - round_down(pos_out, PAGE_SIZE), - round_up(pos_out + *len, PAGE_SIZE) - 1); + /* + * If pos_out > EOF, we may have dirtied blocks between EOF and + * pos_out. In that case, we need to extend the flush and unmap to cover + * from EOF to the end of the copy length. + */ + if (pos_out > XFS_ISIZE(dest)) { + loff_t flen = *len + (pos_out - XFS_ISIZE(dest)); + ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen); + } else { + ret = xfs_flush_unmap_range(dest, pos_out, *len); + } + if (ret) + goto out_unlock; return 1; out_unlock: diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 3043e5ed6495..8a6532aae779 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -280,7 +280,10 @@ DECLARE_EVENT_CLASS(xfs_buf_class, ), TP_fast_assign( __entry->dev = bp->b_target->bt_dev; - __entry->bno = bp->b_bn; + if (bp->b_bn == XFS_BUF_DADDR_NULL) + __entry->bno = bp->b_maps[0].bm_bn; + else + __entry->bno = bp->b_bn; __entry->nblks = bp->b_length; __entry->hold = atomic_read(&bp->b_hold); __entry->pincount = atomic_read(&bp->b_pin_count); |