From 2749f7ef3643fea5ac73b51682d988c3571ec1f9 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 27 Sep 2021 15:22:07 +0800 Subject: btrfs: subpage: avoid potential deadlock with compression and delalloc [BUG] With experimental subpage compression enabled, a simple fsstress can lead to self deadlock on page 720896: mkfs.btrfs -f -s 4k $dev > /dev/null mount $dev -o compress $mnt $fsstress -p 1 -n 100 -w -d $mnt -v -s 1625511156 [CAUSE] If we have a file layout looks like below: 0 32K 64K 96K 128K |//| |///////////////| 4K Then we run delalloc range for the inode, it will: - Call find_lock_delalloc_range() with @delalloc_start = 0 Then we got a delalloc range [0, 4K). This range will be COWed. - Call find_lock_delalloc_range() again with @delalloc_start = 4K Since find_lock_delalloc_range() never cares whether the range is still inside page range [0, 64K), it will return range [64K, 128K). This range meets the condition for subpage compression, will go through async COW path. And async COW path will return @page_started. But that @page_started is now for range [64K, 128K), not for range [0, 64K). - writepage_dellloc() returned 1 for page [0, 64K) Thus page [0, 64K) will not be unlocked, nor its page dirty status will be cleared. Next time when we try to lock page [0, 64K) we will deadlock, as there is no one to release page [0, 64K). This problem will never happen for regular page size as one page only contains one sector. After the first find_lock_delalloc_range() call, the @delalloc_end will go beyond @page_end no matter if we found a delalloc range or not Thus this bug only happens for subpage, as now we need multiple runs to exhaust the delalloc range of a page. [FIX] Fix the problem by ensuring the delalloc range we ran at least started inside @locked_page. So that we will never get incorrect @page_started. And to prevent such problem from happening again: - Make find_lock_delalloc_range() return false if the found range is beyond @end value passed in. Since @end will be utilized now, add an ASSERT() to ensure we pass correct @end into find_lock_delalloc_range(). This also means, for selftests we needs to populate @end before calling find_lock_delalloc_range(). - New ASSERT() in find_lock_delalloc_range() Now we will make sure the @start/@end passed in at least covers part of the page. - New ASSERT() in run_delalloc_range() To make sure the range at least starts inside @locked page. - Use @delalloc_start as proper cursor, while @delalloc_end is always reset to @page_end. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 36 +++++++++++++++++++++++++++--------- fs/btrfs/inode.c | 7 +++++++ fs/btrfs/tests/extent-io-tests.c | 12 ++++++------ 3 files changed, 40 insertions(+), 15 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 18ef234fe3dc..d022654af420 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1975,10 +1975,18 @@ static noinline int lock_delalloc_pages(struct inode *inode, /* * Find and lock a contiguous range of bytes in the file marked as delalloc, no - * more than @max_bytes. @Start and @end are used to return the range, + * more than @max_bytes. * - * Return: true if we find something - * false if nothing was in the tree + * @start: The original start bytenr to search. + * Will store the extent range start bytenr. + * @end: The original end bytenr of the search range + * Will store the extent range end bytenr. + * + * Return true if we find a delalloc range which starts inside the original + * range, and @start/@end will store the delalloc range start/end. + * + * Return false if we can't find any delalloc range which starts inside the + * original range, and @start/@end will be the non-delalloc range start/end. */ EXPORT_FOR_TESTS noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, @@ -1986,6 +1994,8 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, u64 *end) { struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + const u64 orig_start = *start; + const u64 orig_end = *end; u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; u64 delalloc_start; u64 delalloc_end; @@ -1994,15 +2004,23 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, int ret; int loops = 0; + /* Caller should pass a valid @end to indicate the search range end */ + ASSERT(orig_end > orig_start); + + /* The range should at least cover part of the page */ + ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE || + orig_end <= page_offset(locked_page))); again: /* step one, find a bunch of delalloc bytes starting at start */ delalloc_start = *start; delalloc_end = 0; found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end, max_bytes, &cached_state); - if (!found || delalloc_end <= *start) { + if (!found || delalloc_end <= *start || delalloc_start > orig_end) { *start = delalloc_start; - *end = delalloc_end; + + /* @delalloc_end can be -1, never go beyond @orig_end */ + *end = min(delalloc_end, orig_end); free_extent_state(cached_state); return false; } @@ -3771,16 +3789,16 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, struct page *page, struct writeback_control *wbc, unsigned long *nr_written) { - u64 page_end = page_offset(page) + PAGE_SIZE - 1; - bool found; + const u64 page_end = page_offset(page) + PAGE_SIZE - 1; u64 delalloc_start = page_offset(page); u64 delalloc_to_write = 0; - u64 delalloc_end = 0; int ret; int page_started = 0; + while (delalloc_start < page_end) { + u64 delalloc_end = page_end; + bool found; - while (delalloc_end < page_end) { found = find_lock_delalloc_range(&inode->vfs_inode, page, &delalloc_start, &delalloc_end); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 57a2bc25b4d2..0276fa951641 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1955,6 +1955,13 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page int ret; const bool zoned = btrfs_is_zoned(inode->root->fs_info); + /* + * The range must cover part of the @locked_page, or the returned + * @page_started can confuse the caller. + */ + ASSERT(!(end <= page_offset(locked_page) || + start >= page_offset(locked_page) + PAGE_SIZE)); + if (should_nocow(inode, start, end)) { /* * Normally on a zoned device we're only doing COW writes, but diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 73e96d505f4f..c2e72e7a8ff0 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -112,7 +112,7 @@ static int test_find_delalloc(u32 sectorsize) */ set_extent_delalloc(tmp, 0, sectorsize - 1, 0, NULL); start = 0; - end = 0; + end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (!found) { @@ -143,7 +143,7 @@ static int test_find_delalloc(u32 sectorsize) } set_extent_delalloc(tmp, sectorsize, max_bytes - 1, 0, NULL); start = test_start; - end = 0; + end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (!found) { @@ -177,14 +177,14 @@ static int test_find_delalloc(u32 sectorsize) goto out_bits; } start = test_start; - end = 0; + end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (found) { test_err("found range when we shouldn't have"); goto out_bits; } - if (end != (u64)-1) { + if (end != test_start + PAGE_SIZE - 1) { test_err("did not return the proper end offset"); goto out_bits; } @@ -198,7 +198,7 @@ static int test_find_delalloc(u32 sectorsize) */ set_extent_delalloc(tmp, max_bytes, total_dirty - 1, 0, NULL); start = test_start; - end = 0; + end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (!found) { @@ -233,7 +233,7 @@ static int test_find_delalloc(u32 sectorsize) /* We unlocked it in the previous test */ lock_page(locked_page); start = test_start; - end = 0; + end = start + PAGE_SIZE - 1; /* * Currently if we fail to find dirty pages in the delalloc range we * will adjust max_bytes down to PAGE_SIZE and then re-search. If -- cgit v1.2.3