From 96dbcc0072acf4f9565a16e8da96e57e5cee1068 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 3 Oct 2022 15:57:30 +0100
Subject: btrfs: add missing path cache update during fiemap

When looking the stored result for a cached path node, if the stored
result is valid and has a value of true, we must update all the nodes for
all levels below it with a result of true as well. This is necessary when
moving from one leaf in the fs tree to the next one, as well as when
moving from a node at any level to the next node at the same level.

Currently this logic is missing as it was somehow forgotten by a recent
patch with the subject: "btrfs: speedup checking for extent sharedness
during fiemap".

This adds the missing logic, which is the counter part to what we do
when adding a shared node to the cache at store_backref_shared_cache().

Fixes: 12a824dc67a6 ("btrfs: speedup checking for extent sharedness during fiemap")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'fs/btrfs/backref.c')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index dce3a16996b9..3c0c1f626c75 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1557,6 +1557,19 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache
 		return false;
 
 	*is_shared = entry->is_shared;
+	/*
+	 * If the node at this level is shared, than all nodes below are also
+	 * shared. Currently some of the nodes below may be marked as not shared
+	 * because we have just switched from one leaf to another, and switched
+	 * also other nodes above the leaf and below the current level, so mark
+	 * them as shared.
+	 */
+	if (*is_shared) {
+		for (int i = 0; i < level; i++) {
+			cache->entries[i].is_shared = true;
+			cache->entries[i].gen = entry->gen;
+		}
+	}
 
 	return true;
 }
-- 
cgit v1.2.3


From 4fc7b57228243d09c0d878873bf24fa64a90fa01 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Tue, 11 Oct 2022 13:16:51 +0100
Subject: btrfs: fix processing of delayed data refs during backref walking

When processing delayed data references during backref walking and we are
using a share context (we are being called through fiemap), whenever we
find a delayed data reference for an inode different from the one we are
interested in, then we immediately exit and consider the data extent as
shared. This is wrong, because:

1) This might be a DROP reference that will cancel out a reference in the
   extent tree;

2) Even if it's an ADD reference, it may be followed by a DROP reference
   that cancels it out.

In either case we should not exit immediately.

Fix this by never exiting when we find a delayed data reference for
another inode - instead add the reference and if it does not cancel out
other delayed reference, we will exit early when we call
extent_is_shared() after processing all delayed references. If we find
a drop reference, then signal the code that processes references from
the extent tree (add_inline_refs() and add_keyed_refs()) to not exit
immediately if it finds there a reference for another inode, since we
have delayed drop references that may cancel it out. In this later case
we exit once we don't have references in the rb trees that cancel out
each other and have two references for different inodes.

Example reproducer for case 1):

   $ cat test-1.sh
   #!/bin/bash

   DEV=/dev/sdj
   MNT=/mnt/sdj

   mkfs.btrfs -f $DEV
   mount $DEV $MNT

   xfs_io -f -c "pwrite 0 64K" $MNT/foo
   cp --reflink=always $MNT/foo $MNT/bar

   echo
   echo "fiemap after cloning:"
   xfs_io -c "fiemap -v" $MNT/foo

   rm -f $MNT/bar
   echo
   echo "fiemap after removing file bar:"
   xfs_io -c "fiemap -v" $MNT/foo

   umount $MNT

Running it before this patch, the extent is still listed as shared, it has
the flag 0x2000 (FIEMAP_EXTENT_SHARED) set:

   $ ./test-1.sh
   fiemap after cloning:
   /mnt/sdj/foo:
    EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
      0: [0..127]:        26624..26751       128 0x2001

   fiemap after removing file bar:
   /mnt/sdj/foo:
    EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
      0: [0..127]:        26624..26751       128 0x2001

Example reproducer for case 2):

   $ cat test-2.sh
   #!/bin/bash

   DEV=/dev/sdj
   MNT=/mnt/sdj

   mkfs.btrfs -f $DEV
   mount $DEV $MNT

   xfs_io -f -c "pwrite 0 64K" $MNT/foo
   cp --reflink=always $MNT/foo $MNT/bar

   # Flush delayed references to the extent tree and commit current
   # transaction.
   sync

   echo
   echo "fiemap after cloning:"
   xfs_io -c "fiemap -v" $MNT/foo

   rm -f $MNT/bar
   echo
   echo "fiemap after removing file bar:"
   xfs_io -c "fiemap -v" $MNT/foo

   umount $MNT

Running it before this patch, the extent is still listed as shared, it has
the flag 0x2000 (FIEMAP_EXTENT_SHARED) set:

   $ ./test-2.sh
   fiemap after cloning:
   /mnt/sdj/foo:
    EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
      0: [0..127]:        26624..26751       128 0x2001

   fiemap after removing file bar:
   /mnt/sdj/foo:
    EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
      0: [0..127]:        26624..26751       128 0x2001

After this patch, after deleting bar in both tests, the extent is not
reported with the 0x2000 flag anymore, it gets only the flag 0x1
(which is FIEMAP_EXTENT_LAST):

   $ ./test-1.sh
   fiemap after cloning:
   /mnt/sdj/foo:
    EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
      0: [0..127]:        26624..26751       128 0x2001

   fiemap after removing file bar:
   /mnt/sdj/foo:
    EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
      0: [0..127]:        26624..26751       128   0x1

   $ ./test-2.sh
   fiemap after cloning:
   /mnt/sdj/foo:
    EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
      0: [0..127]:        26624..26751       128 0x2001

   fiemap after removing file bar:
   /mnt/sdj/foo:
    EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
      0: [0..127]:        26624..26751       128   0x1

These tests will later be converted to a test case for fstests.

Fixes: dc046b10c8b7d4 ("Btrfs: make fiemap not blow when you have lots of snapshots")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

(limited to 'fs/btrfs/backref.c')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 3c0c1f626c75..cf47dabb786f 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -138,6 +138,7 @@ struct share_check {
 	u64 root_objectid;
 	u64 inum;
 	int share_count;
+	bool have_delayed_delete_refs;
 };
 
 static inline int extent_is_shared(struct share_check *sc)
@@ -884,13 +885,22 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
 			key.offset = ref->offset;
 
 			/*
-			 * Found a inum that doesn't match our known inum, we
-			 * know it's shared.
+			 * If we have a share check context and a reference for
+			 * another inode, we can't exit immediately. This is
+			 * because even if this is a BTRFS_ADD_DELAYED_REF
+			 * reference we may find next a BTRFS_DROP_DELAYED_REF
+			 * which cancels out this ADD reference.
+			 *
+			 * If this is a DROP reference and there was no previous
+			 * ADD reference, then we need to signal that when we
+			 * process references from the extent tree (through
+			 * add_inline_refs() and add_keyed_refs()), we should
+			 * not exit early if we find a reference for another
+			 * inode, because one of the delayed DROP references
+			 * may cancel that reference in the extent tree.
 			 */
-			if (sc && sc->inum && ref->objectid != sc->inum) {
-				ret = BACKREF_FOUND_SHARED;
-				goto out;
-			}
+			if (sc && count < 0)
+				sc->have_delayed_delete_refs = true;
 
 			ret = add_indirect_ref(fs_info, preftrees, ref->root,
 					       &key, 0, node->bytenr, count, sc,
@@ -920,7 +930,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
 	}
 	if (!ret)
 		ret = extent_is_shared(sc);
-out:
+
 	spin_unlock(&head->lock);
 	return ret;
 }
@@ -1023,7 +1033,8 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
 			key.type = BTRFS_EXTENT_DATA_KEY;
 			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
 
-			if (sc && sc->inum && key.objectid != sc->inum) {
+			if (sc && sc->inum && key.objectid != sc->inum &&
+			    !sc->have_delayed_delete_refs) {
 				ret = BACKREF_FOUND_SHARED;
 				break;
 			}
@@ -1033,6 +1044,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
 			ret = add_indirect_ref(fs_info, preftrees, root,
 					       &key, 0, bytenr, count,
 					       sc, GFP_NOFS);
+
 			break;
 		}
 		default:
@@ -1122,7 +1134,8 @@ static int add_keyed_refs(struct btrfs_root *extent_root,
 			key.type = BTRFS_EXTENT_DATA_KEY;
 			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
 
-			if (sc && sc->inum && key.objectid != sc->inum) {
+			if (sc && sc->inum && key.objectid != sc->inum &&
+			    !sc->have_delayed_delete_refs) {
 				ret = BACKREF_FOUND_SHARED;
 				break;
 			}
@@ -1661,6 +1674,7 @@ int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
 		.root_objectid = root->root_key.objectid,
 		.inum = inum,
 		.share_count = 0,
+		.have_delayed_delete_refs = false,
 	};
 	int level;
 
@@ -1726,6 +1740,7 @@ int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
 			break;
 		}
 		shared.share_count = 0;
+		shared.have_delayed_delete_refs = false;
 		cond_resched();
 	}
 
-- 
cgit v1.2.3


From 943553ef9b51db303ab2b955c1025261abfdf6fb Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Tue, 11 Oct 2022 13:16:52 +0100
Subject: btrfs: fix processing of delayed tree block refs during backref
 walking

During backref walking, when processing a delayed reference with a type of
BTRFS_TREE_BLOCK_REF_KEY, we have two bugs there:

1) We are accessing the delayed references extent_op, and its key, without
   the protection of the delayed ref head's lock;

2) If there's no extent op for the delayed ref head, we end up with an
   uninitialized key in the stack, variable 'tmp_op_key', and then pass
   it to add_indirect_ref(), which adds the reference to the indirect
   refs rb tree.

   This is wrong, because indirect references should have a NULL key
   when we don't have access to the key, and in that case they should be
   added to the indirect_missing_keys rb tree and not to the indirect rb
   tree.

   This means that if have BTRFS_TREE_BLOCK_REF_KEY delayed ref resulting
   from freeing an extent buffer, therefore with a count of -1, it will
   not cancel out the corresponding reference we have in the extent tree
   (with a count of 1), since both references end up in different rb
   trees.

   When using fiemap, where we often need to check if extents are shared
   through shared subtrees resulting from snapshots, it means we can
   incorrectly report an extent as shared when it's no longer shared.
   However this is temporary because after the transaction is committed
   the extent is no longer reported as shared, as running the delayed
   reference results in deleting the tree block reference from the extent
   tree.

   Outside the fiemap context, the result is unpredictable, as the key was
   not initialized but it's used when navigating the rb trees to insert
   and search for references (prelim_ref_compare()), and we expect all
   references in the indirect rb tree to have valid keys.

The following reproducer triggers the second bug:

   $ cat test.sh
   #!/bin/bash

   DEV=/dev/sdj
   MNT=/mnt/sdj

   mkfs.btrfs -f $DEV
   mount -o compress $DEV $MNT

   # With a compressed 128M file we get a tree height of 2 (level 1 root).
   xfs_io -f -c "pwrite -b 1M 0 128M" $MNT/foo

   btrfs subvolume snapshot $MNT $MNT/snap

   # Fiemap should output 0x2008 in the flags column.
   # 0x2000 means shared extent
   # 0x8 means encoded extent (because it's compressed)
   echo
   echo "fiemap after snapshot, range [120M, 120M + 128K):"
   xfs_io -c "fiemap -v 120M 128K" $MNT/foo
   echo

   # Overwrite one extent and fsync to flush delalloc and COW a new path
   # in the snapshot's tree.
   #
   # After this we have a BTRFS_DROP_DELAYED_REF delayed ref of type
   # BTRFS_TREE_BLOCK_REF_KEY with a count of -1 for every COWed extent
   # buffer in the path.
   #
   # In the extent tree we have inline references of type
   # BTRFS_TREE_BLOCK_REF_KEY, with a count of 1, for the same extent
   # buffers, so they should cancel each other, and the extent buffers in
   # the fs tree should no longer be considered as shared.
   #
   echo "Overwriting file range [120M, 120M + 128K)..."
   xfs_io -c "pwrite -b 128K 120M 128K" $MNT/snap/foo
   xfs_io -c "fsync" $MNT/snap/foo

   # Fiemap should output 0x8 in the flags column. The extent in the range
   # [120M, 120M + 128K) is no longer shared, it's now exclusive to the fs
   # tree.
   echo
   echo "fiemap after overwrite range [120M, 120M + 128K):"
   xfs_io -c "fiemap -v 120M 128K" $MNT/foo
   echo

   umount $MNT

Running it before this patch:

   $ ./test.sh
   (...)
   wrote 134217728/134217728 bytes at offset 0
   128 MiB, 128 ops; 0.1152 sec (1.085 GiB/sec and 1110.5809 ops/sec)
   Create a snapshot of '/mnt/sdj' in '/mnt/sdj/snap'

   fiemap after snapshot, range [120M, 120M + 128K):
   /mnt/sdj/foo:
    EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
      0: [245760..246015]: 34304..34559       256 0x2008

   Overwriting file range [120M, 120M + 128K)...
   wrote 131072/131072 bytes at offset 125829120
   128 KiB, 1 ops; 0.0001 sec (683.060 MiB/sec and 5464.4809 ops/sec)

   fiemap after overwrite range [120M, 120M + 128K):
   /mnt/sdj/foo:
    EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
      0: [245760..246015]: 34304..34559       256 0x2008

The extent in the range [120M, 120M + 128K) is still reported as shared
(0x2000 bit set) after overwriting that range and flushing delalloc, which
is not correct - an entire path was COWed in the snapshot's tree and the
extent is now only referenced by the original fs tree.

Running it after this patch:

   $ ./test.sh
   (...)
   wrote 134217728/134217728 bytes at offset 0
   128 MiB, 128 ops; 0.1198 sec (1.043 GiB/sec and 1068.2067 ops/sec)
   Create a snapshot of '/mnt/sdj' in '/mnt/sdj/snap'

   fiemap after snapshot, range [120M, 120M + 128K):
   /mnt/sdj/foo:
    EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
      0: [245760..246015]: 34304..34559       256 0x2008

   Overwriting file range [120M, 120M + 128K)...
   wrote 131072/131072 bytes at offset 125829120
   128 KiB, 1 ops; 0.0001 sec (694.444 MiB/sec and 5555.5556 ops/sec)

   fiemap after overwrite range [120M, 120M + 128K):
   /mnt/sdj/foo:
    EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
      0: [245760..246015]: 34304..34559       256   0x8

Now the extent is not reported as shared anymore.

So fix this by passing a NULL key pointer to add_indirect_ref() when
processing a delayed reference for a tree block if there's no extent op
for our delayed ref head with a defined key. Also access the extent op
only after locking the delayed ref head's lock.

The reproducer will be converted later to a test case for fstests.

Fixes: 86d5f994425252 ("btrfs: convert prelimary reference tracking to use rbtrees")
Fixes: a6dbceafb915e8 ("btrfs: Remove unused op_key var from add_delayed_refs")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'fs/btrfs/backref.c')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index cf47dabb786f..4e29ccb234c0 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -821,16 +821,11 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
 			    struct preftrees *preftrees, struct share_check *sc)
 {
 	struct btrfs_delayed_ref_node *node;
-	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
 	struct btrfs_key key;
-	struct btrfs_key tmp_op_key;
 	struct rb_node *n;
 	int count;
 	int ret = 0;
 
-	if (extent_op && extent_op->update_key)
-		btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key);
-
 	spin_lock(&head->lock);
 	for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) {
 		node = rb_entry(n, struct btrfs_delayed_ref_node,
@@ -856,10 +851,16 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
 		case BTRFS_TREE_BLOCK_REF_KEY: {
 			/* NORMAL INDIRECT METADATA backref */
 			struct btrfs_delayed_tree_ref *ref;
+			struct btrfs_key *key_ptr = NULL;
+
+			if (head->extent_op && head->extent_op->update_key) {
+				btrfs_disk_key_to_cpu(&key, &head->extent_op->key);
+				key_ptr = &key;
+			}
 
 			ref = btrfs_delayed_node_to_tree_ref(node);
 			ret = add_indirect_ref(fs_info, preftrees, ref->root,
-					       &tmp_op_key, ref->level + 1,
+					       key_ptr, ref->level + 1,
 					       node->bytenr, count, sc,
 					       GFP_ATOMIC);
 			break;
-- 
cgit v1.2.3


From 63c84b46b3b75798f1ad63527b6250de00331907 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Tue, 11 Oct 2022 13:16:53 +0100
Subject: btrfs: ignore fiemap path cache if we have multiple leaves for a data
 extent

The path cache used during fiemap used to determine the sharedness of
extent buffers in a path from a leaf containing a file extent item
pointing to our data extent up to the root node of the tree, is meant to
be used for a single path. Having a single path is by far the most common
case, and therefore worth to optimize for, but it's possible to actually
have multiple paths because we have 2 or more leaves.

If we have multiple leaves, the 'level' variable keeps getting incremented
in each iteration of the while loop at btrfs_is_data_extent_shared(),
which means we will treat the second leaf in the 'tmp' ulist as a level 1
node, and so forth. In the worst case this can lead to getting a level
greater than or equals to BTRFS_MAX_LEVEL (8), which will trigger a
WARN_ON_ONCE() in the functions to lookup from or store in the path cache
(lookup_backref_shared_cache() and store_backref_shared_cache()). If the
current level never goes beyond 8, due to shared nodes in the paths and
a fs tree height smaller than 8, it can still result in incorrectly
marking one leaf as shared because some other leaf is shared and is stored
one level below that other leaf, as when storing a true sharedness value
in the cache results in updating the sharedness to true of all entries in
the cache below the current level.

Having multiple leaves happens in a case like the following:

  - We have a file extent item point to data extent at bytenr X, for
    a file range [0, 1M[ for example;

  - At this moment we have an extent data ref for the extent, with
    an offset of 0 and a count of 1;

  - A write into the middle of the extent happens, file range [64K, 128K)
    so the file extent item is split into two (at btrfs_drop_extents()):

    1) One for file range [0, 64K), with a length (num_bytes field) of
       64K and an extent offset of 0;

    2) Another one for file range [128K, 1M), with a length of 896K
       (1M - 128K) and an extent offset of 128K.

  - At this moment the two file extent items are located in the same
    leaf;

  - A new file extent item for the range [64K, 128K), pointing to a new
    data extent, is inserted in the leaf. This results in a leaf split
    and now those two file extent items pointing to data extent X end
    up located in different leaves;

  - Once delayed refs are run, we still have a single extent data ref
    item for our data extent at bytenr X, for offset 0, but now with a
    count of 2 instead of 1;

  - So during fiemap, at btrfs_is_data_extent_shared(), after we call
    find_parent_nodes() for the data extent, we get two leaves, since
    we have two file extent items point to data extent at bytenr X that
    are located in two different leaves.

So skip the use of the path cache when we get more than one leaf.

Fixes: 12a824dc67a61e ("btrfs: speedup checking for extent sharedness during fiemap")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c | 25 +++++++++++++++++++++++++
 fs/btrfs/backref.h |  1 +
 2 files changed, 26 insertions(+)

(limited to 'fs/btrfs/backref.c')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 4e29ccb234c0..4ec18ceb2f21 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1536,6 +1536,9 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache
 {
 	struct btrfs_backref_shared_cache_entry *entry;
 
+	if (!cache->use_cache)
+		return false;
+
 	if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))
 		return false;
 
@@ -1600,6 +1603,9 @@ static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache,
 	struct btrfs_backref_shared_cache_entry *entry;
 	u64 gen;
 
+	if (!cache->use_cache)
+		return;
+
 	if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))
 		return;
 
@@ -1697,6 +1703,7 @@ int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
 	/* -1 means we are in the bytenr of the data extent. */
 	level = -1;
 	ULIST_ITER_INIT(&uiter);
+	cache->use_cache = true;
 	while (1) {
 		bool is_shared;
 		bool cached;
@@ -1726,6 +1733,24 @@ int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
 		    extent_gen > btrfs_root_last_snapshot(&root->root_item))
 			break;
 
+		/*
+		 * If our data extent was not directly shared (without multiple
+		 * reference items), than it might have a single reference item
+		 * with a count > 1 for the same offset, which means there are 2
+		 * (or more) file extent items that point to the data extent -
+		 * this happens when a file extent item needs to be split and
+		 * then one item gets moved to another leaf due to a b+tree leaf
+		 * split when inserting some item. In this case the file extent
+		 * items may be located in different leaves and therefore some
+		 * of the leaves may be referenced through shared subtrees while
+		 * others are not. Since our extent buffer cache only works for
+		 * a single path (by far the most common case and simpler to
+		 * deal with), we can not use it if we have multiple leaves
+		 * (which implies multiple paths).
+		 */
+		if (level == -1 && tmp->nnodes > 1)
+			cache->use_cache = false;
+
 		if (level >= 0)
 			store_backref_shared_cache(cache, root, bytenr,
 						   level, false);
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 52ae6957b414..8e69584d538d 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -29,6 +29,7 @@ struct btrfs_backref_shared_cache {
 	 * a given data extent should never exceed the maximum b+tree height.
 	 */
 	struct btrfs_backref_shared_cache_entry entries[BTRFS_MAX_LEVEL];
+	bool use_cache;
 };
 
 typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
-- 
cgit v1.2.3


From 5614dc3a47e3310fbc77ea3b67eaadd1c6417bf1 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Tue, 1 Nov 2022 16:15:37 +0000
Subject: btrfs: fix inode list leak during backref walking at
 resolve_indirect_refs()

During backref walking, at resolve_indirect_refs(), if we get an error
we jump to the 'out' label and call ulist_free() on the 'parents' ulist,
which frees all the elements in the ulist - however that does not free
any inode lists that may be attached to elements, through the 'aux' field
of a ulist node, so we end up leaking lists if we have any attached to
the unodes.

Fix this by calling free_leaf_list() instead of ulist_free() when we exit
from resolve_indirect_refs(). The static function free_leaf_list() is
moved up for this to be possible and it's slightly simplified by removing
unnecessary code.

Fixes: 3301958b7c1d ("Btrfs: add inodes before dropping the extent lock in find_all_leafs")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c | 36 +++++++++++++++++-------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

(limited to 'fs/btrfs/backref.c')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 4ec18ceb2f21..40afae0af4e6 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -648,6 +648,18 @@ unode_aux_to_inode_list(struct ulist_node *node)
 	return (struct extent_inode_elem *)(uintptr_t)node->aux;
 }
 
+static void free_leaf_list(struct ulist *ulist)
+{
+	struct ulist_node *node;
+	struct ulist_iterator uiter;
+
+	ULIST_ITER_INIT(&uiter);
+	while ((node = ulist_next(ulist, &uiter)))
+		free_inode_elem_list(unode_aux_to_inode_list(node));
+
+	ulist_free(ulist);
+}
+
 /*
  * We maintain three separate rbtrees: one for direct refs, one for
  * indirect refs which have a key, and one for indirect refs which do not
@@ -762,7 +774,11 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		cond_resched();
 	}
 out:
-	ulist_free(parents);
+	/*
+	 * We may have inode lists attached to refs in the parents ulist, so we
+	 * must free them before freeing the ulist and its refs.
+	 */
+	free_leaf_list(parents);
 	return ret;
 }
 
@@ -1409,24 +1425,6 @@ out:
 	return ret;
 }
 
-static void free_leaf_list(struct ulist *blocks)
-{
-	struct ulist_node *node = NULL;
-	struct extent_inode_elem *eie;
-	struct ulist_iterator uiter;
-
-	ULIST_ITER_INIT(&uiter);
-	while ((node = ulist_next(blocks, &uiter))) {
-		if (!node->aux)
-			continue;
-		eie = unode_aux_to_inode_list(node);
-		free_inode_elem_list(eie);
-		node->aux = 0;
-	}
-
-	ulist_free(blocks);
-}
-
 /*
  * Finds all leafs with a reference to the specified combination of bytenr and
  * offset. key_list_head will point to a list of corresponding keys (caller must
-- 
cgit v1.2.3


From 92876eec382a0f19f33d09d2c939e9ca49038ae5 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Tue, 1 Nov 2022 16:15:38 +0000
Subject: btrfs: fix inode list leak during backref walking at
 find_parent_nodes()

During backref walking, at find_parent_nodes(), if we are dealing with a
data extent and we get an error while resolving the indirect backrefs, at
resolve_indirect_refs(), or in the while loop that iterates over the refs
in the direct refs rbtree, we end up leaking the inode lists attached to
the direct refs we have in the direct refs rbtree that were not yet added
to the refs ulist passed as argument to find_parent_nodes(). Since they
were not yet added to the refs ulist and prelim_release() does not free
the lists, on error the caller can only free the lists attached to the
refs that were added to the refs ulist, all the remaining refs get their
inode lists never freed, therefore leaking their memory.

Fix this by having prelim_release() always free any attached inode list
to each ref found in the rbtree, and have find_parent_nodes() set the
ref's inode list to NULL once it transfers ownership of the inode list
to a ref added to the refs ulist passed to find_parent_nodes().

Fixes: 86d5f9944252 ("btrfs: convert prelimary reference tracking to use rbtrees")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/backref.c')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 40afae0af4e6..18374a6d05bd 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -289,8 +289,10 @@ static void prelim_release(struct preftree *preftree)
 	struct prelim_ref *ref, *next_ref;
 
 	rbtree_postorder_for_each_entry_safe(ref, next_ref,
-					     &preftree->root.rb_root, rbnode)
+					     &preftree->root.rb_root, rbnode) {
+		free_inode_elem_list(ref->inode_list);
 		free_pref(ref);
+	}
 
 	preftree->root = RB_ROOT_CACHED;
 	preftree->count = 0;
@@ -1384,6 +1386,12 @@ again:
 				if (ret < 0)
 					goto out;
 				ref->inode_list = eie;
+				/*
+				 * We transferred the list ownership to the ref,
+				 * so set to NULL to avoid a double free in case
+				 * an error happens after this.
+				 */
+				eie = NULL;
 			}
 			ret = ulist_add_merge_ptr(refs, ref->parent,
 						  ref->inode_list,
@@ -1409,6 +1417,14 @@ again:
 				eie->next = ref->inode_list;
 			}
 			eie = NULL;
+			/*
+			 * We have transferred the inode list ownership from
+			 * this ref to the ref we added to the 'refs' ulist.
+			 * So set this ref's inode list to NULL to avoid
+			 * use-after-free when our caller uses it or double
+			 * frees in case an error happens before we return.
+			 */
+			ref->inode_list = NULL;
 		}
 		cond_resched();
 	}
-- 
cgit v1.2.3