From 53fd5ffbb5197b8cc2d73d2bbc0f688afd45736c Mon Sep 17 00:00:00 2001
From: Junxiao Bi <junxiao.bi@oracle.com>
Date: Tue, 7 Jun 2022 10:12:26 -0700
Subject: ocfs2: kill EBUSY from dlmfs_evict_inode

When unlinking a dlmfs, first it will invoke dlmfs_unlink(), and then
invoke dlmfs_evict_inode(), user_dlm_destroy_lock() is invoked in both
places, the second one from dlmfs_evict_inode() will get EBUSY error
because USER_LOCK_IN_TEARDOWN is already set in lockres.  This doesn't
affect any function, just the error log is annoying.

Link: https://lkml.kernel.org/r/20220607171226.86672-1-junxiao.bi@oracle.com
Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Gang He <ghe@suse.com>
Cc: Jun Piao <piaojun@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/ocfs2/dlmfs/dlmfs.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index e360543ad7e7..8b2020f92b5f 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -296,17 +296,25 @@ static void dlmfs_evict_inode(struct inode *inode)
 {
 	int status;
 	struct dlmfs_inode_private *ip;
+	struct user_lock_res *lockres;
+	int teardown;
 
 	clear_inode(inode);
 
 	mlog(0, "inode %lu\n", inode->i_ino);
 
 	ip = DLMFS_I(inode);
+	lockres = &ip->ip_lockres;
 
 	if (S_ISREG(inode->i_mode)) {
-		status = user_dlm_destroy_lock(&ip->ip_lockres);
-		if (status < 0)
-			mlog_errno(status);
+		spin_lock(&lockres->l_lock);
+		teardown = !!(lockres->l_flags & USER_LOCK_IN_TEARDOWN);
+		spin_unlock(&lockres->l_lock);
+		if (!teardown) {
+			status = user_dlm_destroy_lock(lockres);
+			if (status < 0)
+				mlog_errno(status);
+		}
 		iput(ip->ip_parent);
 		goto clear_fields;
 	}
-- 
cgit v1.2.3


From dabba87229411a5e9d20ac03ffc36463c53ae672 Mon Sep 17 00:00:00 2001
From: Pasha Tatashin <pasha.tatashin@soleen.com>
Date: Fri, 27 May 2022 02:55:34 +0000
Subject: fs/kernel_read_file: allow to read files up-to ssize_t

Patch series "Allow to kexec with initramfs larger than 2G", v2.

Currently, the largest initramfs that is supported by kexec_file_load()
syscall is 2G.

This is because kernel_read_file() returns int, and is limited to INT_MAX
or 2G.

On the other hand, there are kexec based boot loaders (i.e.  u-root), that
may need to boot netboot images that might be larger than 2G.

The first patch changes the return type from int to ssize_t in
kernel_read_file* functions.

The second patch increases the maximum initramfs file size to 4G.

Tested: verified that can kexec_file_load() works with 4G initramfs
on x86_64.


This patch (of 2):

Currently, the maximum file size that is supported is 2G.  This may be too
small in some cases.  For example, kexec_file_load() system call loads
initramfs.  In some netboot cases initramfs can be rather large.

Allow to use up-to ssize_t bytes.  The callers still can limit the maximum
file size via buf_size.

Link: https://lkml.kernel.org/r/20220527025535.3953665-1-pasha.tatashin@soleen.com
Link: https://lkml.kernel.org/r/20220527025535.3953665-2-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Baoquan He <bhe@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Sasha Levin <sashal@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/kernel_read_file.c            | 38 +++++++++++++++++++-------------------
 include/linux/kernel_read_file.h | 32 ++++++++++++++++----------------
 include/linux/limits.h           |  1 +
 3 files changed, 36 insertions(+), 35 deletions(-)

(limited to 'fs')

diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c
index 1b07550485b9..5d826274570c 100644
--- a/fs/kernel_read_file.c
+++ b/fs/kernel_read_file.c
@@ -29,15 +29,15 @@
  * change between calls to kernel_read_file().
  *
  * Returns number of bytes read (no single read will be bigger
- * than INT_MAX), or negative on error.
+ * than SSIZE_MAX), or negative on error.
  *
  */
-int kernel_read_file(struct file *file, loff_t offset, void **buf,
-		     size_t buf_size, size_t *file_size,
-		     enum kernel_read_file_id id)
+ssize_t kernel_read_file(struct file *file, loff_t offset, void **buf,
+			 size_t buf_size, size_t *file_size,
+			 enum kernel_read_file_id id)
 {
 	loff_t i_size, pos;
-	size_t copied;
+	ssize_t copied;
 	void *allocated = NULL;
 	bool whole_file;
 	int ret;
@@ -58,7 +58,7 @@ int kernel_read_file(struct file *file, loff_t offset, void **buf,
 		goto out;
 	}
 	/* The file is too big for sane activities. */
-	if (i_size > INT_MAX) {
+	if (i_size > SSIZE_MAX) {
 		ret = -EFBIG;
 		goto out;
 	}
@@ -124,12 +124,12 @@ out:
 }
 EXPORT_SYMBOL_GPL(kernel_read_file);
 
-int kernel_read_file_from_path(const char *path, loff_t offset, void **buf,
-			       size_t buf_size, size_t *file_size,
-			       enum kernel_read_file_id id)
+ssize_t kernel_read_file_from_path(const char *path, loff_t offset, void **buf,
+				   size_t buf_size, size_t *file_size,
+				   enum kernel_read_file_id id)
 {
 	struct file *file;
-	int ret;
+	ssize_t ret;
 
 	if (!path || !*path)
 		return -EINVAL;
@@ -144,14 +144,14 @@ int kernel_read_file_from_path(const char *path, loff_t offset, void **buf,
 }
 EXPORT_SYMBOL_GPL(kernel_read_file_from_path);
 
-int kernel_read_file_from_path_initns(const char *path, loff_t offset,
-				      void **buf, size_t buf_size,
-				      size_t *file_size,
-				      enum kernel_read_file_id id)
+ssize_t kernel_read_file_from_path_initns(const char *path, loff_t offset,
+					  void **buf, size_t buf_size,
+					  size_t *file_size,
+					  enum kernel_read_file_id id)
 {
 	struct file *file;
 	struct path root;
-	int ret;
+	ssize_t ret;
 
 	if (!path || !*path)
 		return -EINVAL;
@@ -171,12 +171,12 @@ int kernel_read_file_from_path_initns(const char *path, loff_t offset,
 }
 EXPORT_SYMBOL_GPL(kernel_read_file_from_path_initns);
 
-int kernel_read_file_from_fd(int fd, loff_t offset, void **buf,
-			     size_t buf_size, size_t *file_size,
-			     enum kernel_read_file_id id)
+ssize_t kernel_read_file_from_fd(int fd, loff_t offset, void **buf,
+				 size_t buf_size, size_t *file_size,
+				 enum kernel_read_file_id id)
 {
 	struct fd f = fdget(fd);
-	int ret = -EBADF;
+	ssize_t ret = -EBADF;
 
 	if (!f.file || !(f.file->f_mode & FMODE_READ))
 		goto out;
diff --git a/include/linux/kernel_read_file.h b/include/linux/kernel_read_file.h
index 575ffa1031d3..90451e2e12bd 100644
--- a/include/linux/kernel_read_file.h
+++ b/include/linux/kernel_read_file.h
@@ -35,21 +35,21 @@ static inline const char *kernel_read_file_id_str(enum kernel_read_file_id id)
 	return kernel_read_file_str[id];
 }
 
-int kernel_read_file(struct file *file, loff_t offset,
-		     void **buf, size_t buf_size,
-		     size_t *file_size,
-		     enum kernel_read_file_id id);
-int kernel_read_file_from_path(const char *path, loff_t offset,
-			       void **buf, size_t buf_size,
-			       size_t *file_size,
-			       enum kernel_read_file_id id);
-int kernel_read_file_from_path_initns(const char *path, loff_t offset,
-				      void **buf, size_t buf_size,
-				      size_t *file_size,
-				      enum kernel_read_file_id id);
-int kernel_read_file_from_fd(int fd, loff_t offset,
-			     void **buf, size_t buf_size,
-			     size_t *file_size,
-			     enum kernel_read_file_id id);
+ssize_t kernel_read_file(struct file *file, loff_t offset,
+			 void **buf, size_t buf_size,
+			 size_t *file_size,
+			 enum kernel_read_file_id id);
+ssize_t kernel_read_file_from_path(const char *path, loff_t offset,
+				   void **buf, size_t buf_size,
+				   size_t *file_size,
+				   enum kernel_read_file_id id);
+ssize_t kernel_read_file_from_path_initns(const char *path, loff_t offset,
+					  void **buf, size_t buf_size,
+					  size_t *file_size,
+					  enum kernel_read_file_id id);
+ssize_t kernel_read_file_from_fd(int fd, loff_t offset,
+				 void **buf, size_t buf_size,
+				 size_t *file_size,
+				 enum kernel_read_file_id id);
 
 #endif /* _LINUX_KERNEL_READ_FILE_H */
diff --git a/include/linux/limits.h b/include/linux/limits.h
index b568b9c30bbf..f6bcc9369010 100644
--- a/include/linux/limits.h
+++ b/include/linux/limits.h
@@ -7,6 +7,7 @@
 #include <vdso/limits.h>
 
 #define SIZE_MAX	(~(size_t)0)
+#define SSIZE_MAX	((ssize_t)(SIZE_MAX >> 1))
 #define PHYS_ADDR_MAX	(~(phys_addr_t)0)
 
 #define U8_MAX		((u8)~0U)
-- 
cgit v1.2.3


From f268eedddf3595e85f8883dc50aed29654785696 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@squashfs.org.uk>
Date: Sat, 11 Jun 2022 04:21:32 +0100
Subject: squashfs: extend "page actor" to handle missing pages

Patch series "Squashfs: handle missing pages decompressing into page
cache".

This patchset enables Squashfs to handle missing pages when directly
decompressing datablocks into the page cache.

Previously if the full set of pages needed was not available, Squashfs
would have to fall back to using an intermediate buffer (the older
method), which is slower, involving a memcopy, and it introduces
contention on a shared buffer.

The first patch extends the "page actor" code to handle missing pages.

The second patch updates Squashfs_readpage_block() to use the new
functionality, and removes the code that falls back to using an
intermediate buffer.

This patchset is independent of the readahead work, and it is standalone.
It can be merged on its own.

But the readahead patch for efficiency also needs this patch-set.


This patch (of 2):

This patch extends the "page actor" code to handle missing pages.

Previously if the full set of pages needed to decompress a Squashfs
datablock was unavailable, this would cause decompression to fail on the
missing pages.

In this case direct decompression into the page cache could not be
achieved and the code would fall back to using the older intermediate
buffer method.

With this patch, direct decompression into the page cache can be achieved
with missing pages.

For "multi-shot" decompressors (zlib, xz, zstd), the page actor will
allocate a temporary buffer which is passed to the decompressor, and then
freed by the page actor.

For "single shot" decompressors (lz4, lzo) which decompress into a
contiguous "bounce buffer", and which is then copied into the page cache,
it would be pointless to allocate a temporary buffer, memcpy into it, and
then free it.  For these decompressors -ENOMEM is returned, which
signifies that the memcpy for that page should be skipped.

This also happens if the data block is uncompressed.

Link: https://lkml.kernel.org/r/20220611032133.5743-1-phillip@squashfs.org.uk
Link: https://lkml.kernel.org/r/20220611032133.5743-2-phillip@squashfs.org.uk
Signed-off-by: Phillip Lougher <phillip@squashfs.org.uk>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Hsin-Yi Wang <hsinyi@chromium.org>
Cc: Xiongwei Song <Xiongwei.Song@windriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/squashfs/block.c        | 10 ++++++---
 fs/squashfs/decompressor.h |  1 +
 fs/squashfs/file_direct.c  | 21 +++++++++---------
 fs/squashfs/lz4_wrapper.c  |  7 ++++--
 fs/squashfs/lzo_wrapper.c  |  7 ++++--
 fs/squashfs/page_actor.c   | 55 +++++++++++++++++++++++++++++++++++++++-------
 fs/squashfs/page_actor.h   | 21 +++++++++++++++---
 fs/squashfs/xz_wrapper.c   | 11 +++++++++-
 fs/squashfs/zlib_wrapper.c | 12 +++++++++-
 fs/squashfs/zstd_wrapper.c | 12 +++++++++-
 10 files changed, 126 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 8879d052f96c..833aca92301f 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -34,12 +34,15 @@ static int copy_bio_to_actor(struct bio *bio,
 			     struct squashfs_page_actor *actor,
 			     int offset, int req_length)
 {
-	void *actor_addr = squashfs_first_page(actor);
+	void *actor_addr;
 	struct bvec_iter_all iter_all = {};
 	struct bio_vec *bvec = bvec_init_iter_all(&iter_all);
 	int copied_bytes = 0;
 	int actor_offset = 0;
 
+	squashfs_actor_nobuff(actor);
+	actor_addr = squashfs_first_page(actor);
+
 	if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all)))
 		return 0;
 
@@ -49,8 +52,9 @@ static int copy_bio_to_actor(struct bio *bio,
 
 		bytes_to_copy = min_t(int, bytes_to_copy,
 				      req_length - copied_bytes);
-		memcpy(actor_addr + actor_offset, bvec_virt(bvec) + offset,
-		       bytes_to_copy);
+		if (!IS_ERR(actor_addr))
+			memcpy(actor_addr + actor_offset, bvec_virt(bvec) +
+					offset, bytes_to_copy);
 
 		actor_offset += bytes_to_copy;
 		copied_bytes += bytes_to_copy;
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index 1b9ccfd0aa51..19ab60834389 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -20,6 +20,7 @@ struct squashfs_decompressor {
 		struct bio *, int, int, struct squashfs_page_actor *);
 	int	id;
 	char	*name;
+	int	alloc_buffer;
 	int	supported;
 };
 
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
index a4894cc59447..5af5802f5626 100644
--- a/fs/squashfs/file_direct.c
+++ b/fs/squashfs/file_direct.c
@@ -47,14 +47,6 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
 	if (page == NULL)
 		return res;
 
-	/*
-	 * Create a "page actor" which will kmap and kunmap the
-	 * page cache pages appropriately within the decompressor
-	 */
-	actor = squashfs_page_actor_init_special(page, pages, 0);
-	if (actor == NULL)
-		goto out;
-
 	/* Try to grab all the pages covered by the Squashfs block */
 	for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) {
 		page[i] = (n == target_page->index) ? target_page :
@@ -89,8 +81,19 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
 		goto out;
 	}
 
+	/*
+	 * Create a "page actor" which will kmap and kunmap the
+	 * page cache pages appropriately within the decompressor
+	 */
+	actor = squashfs_page_actor_init_special(msblk, page, pages, 0);
+	if (actor == NULL)
+		goto out;
+
 	/* Decompress directly into the page cache buffers */
 	res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor);
+
+	kfree(actor);
+
 	if (res < 0)
 		goto mark_errored;
 
@@ -116,7 +119,6 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
 			put_page(page[i]);
 	}
 
-	kfree(actor);
 	kfree(page);
 
 	return 0;
@@ -135,7 +137,6 @@ mark_errored:
 	}
 
 out:
-	kfree(actor);
 	kfree(page);
 	return res;
 }
diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c
index b685b6238316..49797729f143 100644
--- a/fs/squashfs/lz4_wrapper.c
+++ b/fs/squashfs/lz4_wrapper.c
@@ -119,10 +119,12 @@ static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm,
 	buff = stream->output;
 	while (data) {
 		if (bytes <= PAGE_SIZE) {
-			memcpy(data, buff, bytes);
+			if (!IS_ERR(data))
+				memcpy(data, buff, bytes);
 			break;
 		}
-		memcpy(data, buff, PAGE_SIZE);
+		if (!IS_ERR(data))
+			memcpy(data, buff, PAGE_SIZE);
 		buff += PAGE_SIZE;
 		bytes -= PAGE_SIZE;
 		data = squashfs_next_page(output);
@@ -139,5 +141,6 @@ const struct squashfs_decompressor squashfs_lz4_comp_ops = {
 	.decompress = lz4_uncompress,
 	.id = LZ4_COMPRESSION,
 	.name = "lz4",
+	.alloc_buffer = 0,
 	.supported = 1
 };
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index cb510a631968..d216aeefa865 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -93,10 +93,12 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm,
 	buff = stream->output;
 	while (data) {
 		if (bytes <= PAGE_SIZE) {
-			memcpy(data, buff, bytes);
+			if (!IS_ERR(data))
+				memcpy(data, buff, bytes);
 			break;
 		} else {
-			memcpy(data, buff, PAGE_SIZE);
+			if (!IS_ERR(data))
+				memcpy(data, buff, PAGE_SIZE);
 			buff += PAGE_SIZE;
 			bytes -= PAGE_SIZE;
 			data = squashfs_next_page(output);
@@ -116,5 +118,6 @@ const struct squashfs_decompressor squashfs_lzo_comp_ops = {
 	.decompress = lzo_uncompress,
 	.id = LZO_COMPRESSION,
 	.name = "lzo",
+	.alloc_buffer = 0,
 	.supported = 1
 };
diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c
index 520d323a99ce..b23b780d8f42 100644
--- a/fs/squashfs/page_actor.c
+++ b/fs/squashfs/page_actor.c
@@ -7,6 +7,8 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>
+#include "squashfs_fs_sb.h"
+#include "decompressor.h"
 #include "page_actor.h"
 
 /*
@@ -57,29 +59,62 @@ struct squashfs_page_actor *squashfs_page_actor_init(void **buffer,
 }
 
 /* Implementation of page_actor for decompressing directly into page cache. */
+static void *handle_next_page(struct squashfs_page_actor *actor)
+{
+	int max_pages = (actor->length + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+	if (actor->returned_pages == max_pages)
+		return NULL;
+
+	if ((actor->next_page == actor->pages) ||
+			(actor->next_index != actor->page[actor->next_page]->index)) {
+		if (actor->alloc_buffer) {
+			void *tmp_buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
+
+			if (tmp_buffer) {
+				actor->tmp_buffer = tmp_buffer;
+				actor->next_index++;
+				actor->returned_pages++;
+				return tmp_buffer;
+			}
+		}
+
+		actor->next_index++;
+		actor->returned_pages++;
+		return ERR_PTR(-ENOMEM);
+	}
+
+	actor->next_index++;
+	actor->returned_pages++;
+	return actor->pageaddr = kmap_local_page(actor->page[actor->next_page++]);
+}
+
 static void *direct_first_page(struct squashfs_page_actor *actor)
 {
-	actor->next_page = 1;
-	return actor->pageaddr = kmap_atomic(actor->page[0]);
+	return handle_next_page(actor);
 }
 
 static void *direct_next_page(struct squashfs_page_actor *actor)
 {
 	if (actor->pageaddr)
-		kunmap_atomic(actor->pageaddr);
+		kunmap_local(actor->pageaddr);
+
+	kfree(actor->tmp_buffer);
+	actor->pageaddr = actor->tmp_buffer = NULL;
 
-	return actor->pageaddr = actor->next_page == actor->pages ? NULL :
-		kmap_atomic(actor->page[actor->next_page++]);
+	return handle_next_page(actor);
 }
 
 static void direct_finish_page(struct squashfs_page_actor *actor)
 {
 	if (actor->pageaddr)
-		kunmap_atomic(actor->pageaddr);
+		kunmap_local(actor->pageaddr);
+
+	kfree(actor->tmp_buffer);
 }
 
-struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page,
-	int pages, int length)
+struct squashfs_page_actor *squashfs_page_actor_init_special(struct squashfs_sb_info *msblk,
+	struct page **page, int pages, int length)
 {
 	struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
 
@@ -90,7 +125,11 @@ struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page,
 	actor->page = page;
 	actor->pages = pages;
 	actor->next_page = 0;
+	actor->returned_pages = 0;
+	actor->next_index = page[0]->index & ~((1 << (msblk->block_log - PAGE_SHIFT)) - 1);
 	actor->pageaddr = NULL;
+	actor->tmp_buffer = NULL;
+	actor->alloc_buffer = msblk->decompressor->alloc_buffer;
 	actor->squashfs_first_page = direct_first_page;
 	actor->squashfs_next_page = direct_next_page;
 	actor->squashfs_finish_page = direct_finish_page;
diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h
index 2e3073ace009..37523c54256f 100644
--- a/fs/squashfs/page_actor.h
+++ b/fs/squashfs/page_actor.h
@@ -45,6 +45,11 @@ static inline void squashfs_finish_page(struct squashfs_page_actor *actor)
 {
 	/* empty */
 }
+
+static inline void squashfs_actor_nobuff(struct squashfs_page_actor *actor)
+{
+	/* empty */
+}
 #else
 struct squashfs_page_actor {
 	union {
@@ -52,17 +57,23 @@ struct squashfs_page_actor {
 		struct page	**page;
 	};
 	void	*pageaddr;
+	void	*tmp_buffer;
 	void    *(*squashfs_first_page)(struct squashfs_page_actor *);
 	void    *(*squashfs_next_page)(struct squashfs_page_actor *);
 	void    (*squashfs_finish_page)(struct squashfs_page_actor *);
 	int	pages;
 	int	length;
 	int	next_page;
+	int	alloc_buffer;
+	int	returned_pages;
+	pgoff_t	next_index;
 };
 
-extern struct squashfs_page_actor *squashfs_page_actor_init(void **, int, int);
-extern struct squashfs_page_actor *squashfs_page_actor_init_special(struct page
-							 **, int, int);
+extern struct squashfs_page_actor *squashfs_page_actor_init(void **buffer,
+				int pages, int length);
+extern struct squashfs_page_actor *squashfs_page_actor_init_special(
+				struct squashfs_sb_info *msblk,
+				struct page **page, int pages, int length);
 static inline void *squashfs_first_page(struct squashfs_page_actor *actor)
 {
 	return actor->squashfs_first_page(actor);
@@ -75,5 +86,9 @@ static inline void squashfs_finish_page(struct squashfs_page_actor *actor)
 {
 	actor->squashfs_finish_page(actor);
 }
+static inline void squashfs_actor_nobuff(struct squashfs_page_actor *actor)
+{
+	actor->alloc_buffer = 0;
+}
 #endif
 #endif
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
index 68f6d09bb3a2..6c49481a2f8c 100644
--- a/fs/squashfs/xz_wrapper.c
+++ b/fs/squashfs/xz_wrapper.c
@@ -131,6 +131,10 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
 	stream->buf.out_pos = 0;
 	stream->buf.out_size = PAGE_SIZE;
 	stream->buf.out = squashfs_first_page(output);
+	if (IS_ERR(stream->buf.out)) {
+		error = PTR_ERR(stream->buf.out);
+		goto finish;
+	}
 
 	for (;;) {
 		enum xz_ret xz_err;
@@ -156,7 +160,10 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
 
 		if (stream->buf.out_pos == stream->buf.out_size) {
 			stream->buf.out = squashfs_next_page(output);
-			if (stream->buf.out != NULL) {
+			if (IS_ERR(stream->buf.out)) {
+				error = PTR_ERR(stream->buf.out);
+				break;
+			} else if (stream->buf.out != NULL) {
 				stream->buf.out_pos = 0;
 				total += PAGE_SIZE;
 			}
@@ -171,6 +178,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
 		}
 	}
 
+finish:
 	squashfs_finish_page(output);
 
 	return error ? error : total + stream->buf.out_pos;
@@ -183,5 +191,6 @@ const struct squashfs_decompressor squashfs_xz_comp_ops = {
 	.decompress = squashfs_xz_uncompress,
 	.id = XZ_COMPRESSION,
 	.name = "xz",
+	.alloc_buffer = 1,
 	.supported = 1
 };
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index a20e9042146b..cbb7afe7bc46 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -62,6 +62,11 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
 	stream->next_out = squashfs_first_page(output);
 	stream->avail_in = 0;
 
+	if (IS_ERR(stream->next_out)) {
+		error = PTR_ERR(stream->next_out);
+		goto finish;
+	}
+
 	for (;;) {
 		int zlib_err;
 
@@ -85,7 +90,10 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
 
 		if (stream->avail_out == 0) {
 			stream->next_out = squashfs_next_page(output);
-			if (stream->next_out != NULL)
+			if (IS_ERR(stream->next_out)) {
+				error = PTR_ERR(stream->next_out);
+				break;
+			} else if (stream->next_out != NULL)
 				stream->avail_out = PAGE_SIZE;
 		}
 
@@ -107,6 +115,7 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
 		}
 	}
 
+finish:
 	squashfs_finish_page(output);
 
 	if (!error)
@@ -122,6 +131,7 @@ const struct squashfs_decompressor squashfs_zlib_comp_ops = {
 	.decompress = zlib_uncompress,
 	.id = ZLIB_COMPRESSION,
 	.name = "zlib",
+	.alloc_buffer = 1,
 	.supported = 1
 };
 
diff --git a/fs/squashfs/zstd_wrapper.c b/fs/squashfs/zstd_wrapper.c
index c40445dbf38c..0e407c4d8b3b 100644
--- a/fs/squashfs/zstd_wrapper.c
+++ b/fs/squashfs/zstd_wrapper.c
@@ -80,6 +80,10 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm,
 
 	out_buf.size = PAGE_SIZE;
 	out_buf.dst = squashfs_first_page(output);
+	if (IS_ERR(out_buf.dst)) {
+		error = PTR_ERR(out_buf.dst);
+		goto finish;
+	}
 
 	for (;;) {
 		size_t zstd_err;
@@ -104,7 +108,10 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm,
 
 		if (out_buf.pos == out_buf.size) {
 			out_buf.dst = squashfs_next_page(output);
-			if (out_buf.dst == NULL) {
+			if (IS_ERR(out_buf.dst)) {
+				error = PTR_ERR(out_buf.dst);
+				break;
+			} else if (out_buf.dst == NULL) {
 				/* Shouldn't run out of pages
 				 * before stream is done.
 				 */
@@ -129,6 +136,8 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm,
 		}
 	}
 
+finish:
+
 	squashfs_finish_page(output);
 
 	return error ? error : total_out;
@@ -140,5 +149,6 @@ const struct squashfs_decompressor squashfs_zstd_comp_ops = {
 	.decompress = zstd_uncompress,
 	.id = ZSTD_COMPRESSION,
 	.name = "zstd",
+	.alloc_buffer = 1,
 	.supported = 1
 };
-- 
cgit v1.2.3


From 1bb1a07afad97303f14b8d1b319b03f1f01a0091 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@squashfs.org.uk>
Date: Sat, 11 Jun 2022 04:21:33 +0100
Subject: squashfs: don't use intermediate buffer if pages missing

Now that the "page actor" can handle missing pages, we don't have to fall
back to using an intermediate buffer in Squashfs_readpage_block() if all
the pages necessary can't be obtained.

Link: https://lkml.kernel.org/r/20220611032133.5743-3-phillip@squashfs.org.uk
Signed-off-by: Phillip Lougher <phillip@squashfs.org.uk>
Cc: Hsin-Yi Wang <hsinyi@chromium.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Xiongwei Song <Xiongwei.Song@windriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/squashfs/file_direct.c | 75 ++++++++---------------------------------------
 1 file changed, 12 insertions(+), 63 deletions(-)

(limited to 'fs')

diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
index 5af5802f5626..be4b12d31e0c 100644
--- a/fs/squashfs/file_direct.c
+++ b/fs/squashfs/file_direct.c
@@ -18,9 +18,6 @@
 #include "squashfs.h"
 #include "page_actor.h"
 
-static int squashfs_read_cache(struct page *target_page, u64 block, int bsize,
-	int pages, struct page **page, int bytes);
-
 /* Read separately compressed datablock directly into page cache */
 int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
 	int expected)
@@ -33,7 +30,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
 	int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1;
 	int start_index = target_page->index & ~mask;
 	int end_index = start_index | mask;
-	int i, n, pages, missing_pages, bytes, res = -ENOMEM;
+	int i, n, pages, bytes, res = -ENOMEM;
 	struct page **page;
 	struct squashfs_page_actor *actor;
 	void *pageaddr;
@@ -48,44 +45,29 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
 		return res;
 
 	/* Try to grab all the pages covered by the Squashfs block */
-	for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) {
+	for (i = 0, n = start_index; n <= end_index; n++) {
 		page[i] = (n == target_page->index) ? target_page :
 			grab_cache_page_nowait(target_page->mapping, n);
 
-		if (page[i] == NULL) {
-			missing_pages++;
+		if (page[i] == NULL)
 			continue;
-		}
 
 		if (PageUptodate(page[i])) {
 			unlock_page(page[i]);
 			put_page(page[i]);
-			page[i] = NULL;
-			missing_pages++;
+			continue;
 		}
-	}
-
-	if (missing_pages) {
-		/*
-		 * Couldn't get one or more pages, this page has either
-		 * been VM reclaimed, but others are still in the page cache
-		 * and uptodate, or we're racing with another thread in
-		 * squashfs_readpage also trying to grab them.  Fall back to
-		 * using an intermediate buffer.
-		 */
-		res = squashfs_read_cache(target_page, block, bsize, pages,
-							page, expected);
-		if (res < 0)
-			goto mark_errored;
 
-		goto out;
+		i++;
 	}
 
+	pages = i;
+
 	/*
 	 * Create a "page actor" which will kmap and kunmap the
 	 * page cache pages appropriately within the decompressor
 	 */
-	actor = squashfs_page_actor_init_special(msblk, page, pages, 0);
+	actor = squashfs_page_actor_init_special(msblk, page, pages, expected);
 	if (actor == NULL)
 		goto out;
 
@@ -102,12 +84,12 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
 		goto mark_errored;
 	}
 
-	/* Last page may have trailing bytes not filled */
+	/* Last page (if present) may have trailing bytes not filled */
 	bytes = res % PAGE_SIZE;
-	if (bytes) {
-		pageaddr = kmap_atomic(page[pages - 1]);
+	if (page[pages - 1]->index == end_index && bytes) {
+		pageaddr = kmap_local_page(page[pages - 1]);
 		memset(pageaddr + bytes, 0, PAGE_SIZE - bytes);
-		kunmap_atomic(pageaddr);
+		kunmap_local(pageaddr);
 	}
 
 	/* Mark pages as uptodate, unlock and release */
@@ -140,36 +122,3 @@ out:
 	kfree(page);
 	return res;
 }
-
-
-static int squashfs_read_cache(struct page *target_page, u64 block, int bsize,
-	int pages, struct page **page, int bytes)
-{
-	struct inode *i = target_page->mapping->host;
-	struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb,
-						 block, bsize);
-	int res = buffer->error, n, offset = 0;
-
-	if (res) {
-		ERROR("Unable to read page, block %llx, size %x\n", block,
-			bsize);
-		goto out;
-	}
-
-	for (n = 0; n < pages && bytes > 0; n++,
-			bytes -= PAGE_SIZE, offset += PAGE_SIZE) {
-		int avail = min_t(int, bytes, PAGE_SIZE);
-
-		if (page[n] == NULL)
-			continue;
-
-		squashfs_fill_page(page[n], buffer, offset, avail);
-		unlock_page(page[n]);
-		if (page[n] != target_page)
-			put_page(page[n]);
-	}
-
-out:
-	squashfs_cache_put(buffer);
-	return res;
-}
-- 
cgit v1.2.3


From 019a0c9e377c9f7bd477a0742706d93cdddaee4d Mon Sep 17 00:00:00 2001
From: Javier Martinez Canillas <javierm@redhat.com>
Date: Fri, 10 Jun 2022 09:57:18 +0200
Subject: fat: add a vfat_rename2() and make existing .rename callback a helper

Patch series "fat: add support for the renameat2 RENAME_EXCHANGE flag", v6.

The series adds support for the renameat2 system call RENAME_EXCHANGE flag
(which allows to atomically replace two paths) to the vfat filesystem
code.

There are many use cases for this, but we are particularly interested in
making possible for vfat filesystems to be part of OSTree [0] deployments.

Currently OSTree relies on symbolic links to make the deployment updates
an atomic transactional operation.  But RENAME_EXCHANGE could be used [1]
to achieve a similar level of robustness when using a vfat filesystem.

Patch #1 is just a preparatory patch to introduce the RENAME_EXCHANGE
support, patch #2 moves some code blocks in vfat_rename() to a set of
helper functions, that can be reused by tvfat_rename_exchange() that's
added by patch #3 and finally patch #4 adds some kselftests to test it.


This patch (of 4):

Currently vfat only supports the RENAME_NOREPLACE flag which is handled by
the virtual file system layer but doesn't support the RENAME_EXCHANGE
flag.

Add a vfat_rename2() function to be used as the .rename callback and move
the current vfat_rename() handler to a helper.  This is in preparation for
implementing the RENAME_NOREPLACE flag using a different helper function.

Link: https://lkml.kernel.org/r/20220610075721.1182745-1-javierm@redhat.com
Link: https://lkml.kernel.org/r/20220610075721.1182745-2-javierm@redhat.com
Signed-off-by: Javier Martinez Canillas <javierm@redhat.com>
Acked-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Christian Kellner <ckellner@redhat.com>
Cc: Peter Jones <pjones@redhat.com>
Cc: Chung-Chiang Cheng <cccheng@synology.com>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Alexander Larsson <alexl@redhat.com>
Cc: Colin Walters <walters@verbum.org>
Cc: Muhammad Usama Anjum <usama.anjum@collabora.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/fat/namei_vfat.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index c573314806cf..88ccb2ee3537 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -889,9 +889,8 @@ out:
 	return err;
 }
 
-static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
-		       struct dentry *old_dentry, struct inode *new_dir,
-		       struct dentry *new_dentry, unsigned int flags)
+static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
+		       struct inode *new_dir, struct dentry *new_dentry)
 {
 	struct buffer_head *dotdot_bh;
 	struct msdos_dir_entry *dotdot_de;
@@ -902,9 +901,6 @@ static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
 	int err, is_dir, update_dotdot, corrupt = 0;
 	struct super_block *sb = old_dir->i_sb;
 
-	if (flags & ~RENAME_NOREPLACE)
-		return -EINVAL;
-
 	old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
 	old_inode = d_inode(old_dentry);
 	new_inode = d_inode(new_dentry);
@@ -1021,13 +1017,24 @@ error_inode:
 	goto out;
 }
 
+static int vfat_rename2(struct user_namespace *mnt_userns, struct inode *old_dir,
+			struct dentry *old_dentry, struct inode *new_dir,
+			struct dentry *new_dentry, unsigned int flags)
+{
+	if (flags & ~RENAME_NOREPLACE)
+		return -EINVAL;
+
+	/* VFS already handled RENAME_NOREPLACE, handle it as a normal rename */
+	return vfat_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
 static const struct inode_operations vfat_dir_inode_operations = {
 	.create		= vfat_create,
 	.lookup		= vfat_lookup,
 	.unlink		= vfat_unlink,
 	.mkdir		= vfat_mkdir,
 	.rmdir		= vfat_rmdir,
-	.rename		= vfat_rename,
+	.rename		= vfat_rename2,
 	.setattr	= fat_setattr,
 	.getattr	= fat_getattr,
 	.update_time	= fat_update_time,
-- 
cgit v1.2.3


From 204d03203a145b443cd8676dc12dbb47e1a3751f Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Fri, 10 Jun 2022 09:57:19 +0200
Subject: fat: factor out reusable code in vfat_rename() as helper functions

The vfat_rename() function is quite big and there are code blocks that can
be moved into helper functions.  This not only simplify the implementation
of that function but also allows these helpers to be reused.

For example, the helpers can be used by the handler of the RENAME_EXCHANGE
flag once this is implemented in a subsequent patch.

Link: https://lkml.kernel.org/r/20220610075721.1182745-3-javierm@redhat.com
Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Javier Martinez Canillas <javierm@redhat.com>
Acked-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Alexander Larsson <alexl@redhat.com>
Cc: Christian Kellner <ckellner@redhat.com>
Cc: Chung-Chiang Cheng <cccheng@synology.com>
Cc: Colin Walters <walters@verbum.org>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Peter Jones <pjones@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/fat/namei_vfat.c | 89 ++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 57 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 88ccb2ee3537..9c04053a8f1c 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -889,16 +889,55 @@ out:
 	return err;
 }
 
+static int vfat_get_dotdot_de(struct inode *inode, struct buffer_head **bh,
+			      struct msdos_dir_entry **de)
+{
+	if (S_ISDIR(inode->i_mode)) {
+		if (fat_get_dotdot_entry(inode, bh, de))
+			return -EIO;
+	}
+	return 0;
+}
+
+static int vfat_sync_ipos(struct inode *dir, struct inode *inode)
+{
+	if (IS_DIRSYNC(dir))
+		return fat_sync_inode(inode);
+	mark_inode_dirty(inode);
+	return 0;
+}
+
+static int vfat_update_dotdot_de(struct inode *dir, struct inode *inode,
+				 struct buffer_head *dotdot_bh,
+				 struct msdos_dir_entry *dotdot_de)
+{
+	fat_set_start(dotdot_de, MSDOS_I(dir)->i_logstart);
+	mark_buffer_dirty_inode(dotdot_bh, inode);
+	if (IS_DIRSYNC(dir))
+		return sync_dirty_buffer(dotdot_bh);
+	return 0;
+}
+
+static void vfat_update_dir_metadata(struct inode *dir, struct timespec64 *ts)
+{
+	inode_inc_iversion(dir);
+	fat_truncate_time(dir, ts, S_CTIME | S_MTIME);
+	if (IS_DIRSYNC(dir))
+		(void)fat_sync_inode(dir);
+	else
+		mark_inode_dirty(dir);
+}
+
 static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 		       struct inode *new_dir, struct dentry *new_dentry)
 {
 	struct buffer_head *dotdot_bh;
-	struct msdos_dir_entry *dotdot_de;
+	struct msdos_dir_entry *dotdot_de = NULL;
 	struct inode *old_inode, *new_inode;
 	struct fat_slot_info old_sinfo, sinfo;
 	struct timespec64 ts;
 	loff_t new_i_pos;
-	int err, is_dir, update_dotdot, corrupt = 0;
+	int err, is_dir, corrupt = 0;
 	struct super_block *sb = old_dir->i_sb;
 
 	old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
@@ -909,15 +948,13 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (err)
 		goto out;
 
-	is_dir = S_ISDIR(old_inode->i_mode);
-	update_dotdot = (is_dir && old_dir != new_dir);
-	if (update_dotdot) {
-		if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de)) {
-			err = -EIO;
+	if (old_dir != new_dir) {
+		err = vfat_get_dotdot_de(old_inode, &dotdot_bh, &dotdot_de);
+		if (err)
 			goto out;
-		}
 	}
 
+	is_dir = S_ISDIR(old_inode->i_mode);
 	ts = current_time(old_dir);
 	if (new_inode) {
 		if (is_dir) {
@@ -938,21 +975,15 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	fat_detach(old_inode);
 	fat_attach(old_inode, new_i_pos);
-	if (IS_DIRSYNC(new_dir)) {
-		err = fat_sync_inode(old_inode);
-		if (err)
-			goto error_inode;
-	} else
-		mark_inode_dirty(old_inode);
+	err = vfat_sync_ipos(new_dir, old_inode);
+	if (err)
+		goto error_inode;
 
-	if (update_dotdot) {
-		fat_set_start(dotdot_de, MSDOS_I(new_dir)->i_logstart);
-		mark_buffer_dirty_inode(dotdot_bh, old_inode);
-		if (IS_DIRSYNC(new_dir)) {
-			err = sync_dirty_buffer(dotdot_bh);
-			if (err)
-				goto error_dotdot;
-		}
+	if (dotdot_de) {
+		err = vfat_update_dotdot_de(new_dir, old_inode, dotdot_bh,
+					    dotdot_de);
+		if (err)
+			goto error_dotdot;
 		drop_nlink(old_dir);
 		if (!new_inode)
  			inc_nlink(new_dir);
@@ -962,12 +993,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 	old_sinfo.bh = NULL;
 	if (err)
 		goto error_dotdot;
-	inode_inc_iversion(old_dir);
-	fat_truncate_time(old_dir, &ts, S_CTIME|S_MTIME);
-	if (IS_DIRSYNC(old_dir))
-		(void)fat_sync_inode(old_dir);
-	else
-		mark_inode_dirty(old_dir);
+	vfat_update_dir_metadata(old_dir, &ts);
 
 	if (new_inode) {
 		drop_nlink(new_inode);
@@ -987,10 +1013,9 @@ error_dotdot:
 	/* data cluster is shared, serious corruption */
 	corrupt = 1;
 
-	if (update_dotdot) {
-		fat_set_start(dotdot_de, MSDOS_I(old_dir)->i_logstart);
-		mark_buffer_dirty_inode(dotdot_bh, old_inode);
-		corrupt |= sync_dirty_buffer(dotdot_bh);
+	if (dotdot_de) {
+		corrupt |= vfat_update_dotdot_de(old_dir, old_inode, dotdot_bh,
+						 dotdot_de);
 	}
 error_inode:
 	fat_detach(old_inode);
-- 
cgit v1.2.3


From da87e1725ae2136baeb9aac04c572c283afc917f Mon Sep 17 00:00:00 2001
From: Javier Martinez Canillas <javierm@redhat.com>
Date: Fri, 10 Jun 2022 09:57:20 +0200
Subject: fat: add renameat2 RENAME_EXCHANGE flag support

The renameat2 RENAME_EXCHANGE flag allows to atomically exchange two paths
but is currently not supported by the Linux vfat filesystem driver.

Add a vfat_rename_exchange() helper function that implements this support.

The super block lock is acquired during the operation to ensure atomicity,
and in the error path actions made are reversed also with the mutex held.

It makes the operation as transactional as possible, within the limitation
impossed by vfat due not having a journal with logs to replay.

Link: https://lkml.kernel.org/r/20220610075721.1182745-4-javierm@redhat.com
Signed-off-by: Javier Martinez Canillas <javierm@redhat.com>
Acked-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Alexander Larsson <alexl@redhat.com>
Cc: Christian Kellner <ckellner@redhat.com>
Cc: Chung-Chiang Cheng <cccheng@synology.com>
Cc: Colin Walters <walters@verbum.org>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Peter Jones <pjones@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/fat/namei_vfat.c | 123 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 122 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 9c04053a8f1c..21620054e1c4 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -1042,13 +1042,134 @@ error_inode:
 	goto out;
 }
 
+static void vfat_exchange_ipos(struct inode *old_inode, struct inode *new_inode,
+			       loff_t old_i_pos, loff_t new_i_pos)
+{
+	fat_detach(old_inode);
+	fat_detach(new_inode);
+	fat_attach(old_inode, new_i_pos);
+	fat_attach(new_inode, old_i_pos);
+}
+
+static void vfat_move_nlink(struct inode *src, struct inode *dst)
+{
+	drop_nlink(src);
+	inc_nlink(dst);
+}
+
+static int vfat_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
+				struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct buffer_head *old_dotdot_bh = NULL, *new_dotdot_bh = NULL;
+	struct msdos_dir_entry *old_dotdot_de = NULL, *new_dotdot_de = NULL;
+	struct inode *old_inode, *new_inode;
+	struct timespec64 ts = current_time(old_dir);
+	loff_t old_i_pos, new_i_pos;
+	int err, corrupt = 0;
+	struct super_block *sb = old_dir->i_sb;
+
+	old_inode = d_inode(old_dentry);
+	new_inode = d_inode(new_dentry);
+
+	/* Acquire super block lock for the operation to be atomic */
+	mutex_lock(&MSDOS_SB(sb)->s_lock);
+
+	/* if directories are not the same, get ".." info to update */
+	if (old_dir != new_dir) {
+		err = vfat_get_dotdot_de(old_inode, &old_dotdot_bh,
+					 &old_dotdot_de);
+		if (err)
+			goto out;
+
+		err = vfat_get_dotdot_de(new_inode, &new_dotdot_bh,
+					 &new_dotdot_de);
+		if (err)
+			goto out;
+	}
+
+	old_i_pos = MSDOS_I(old_inode)->i_pos;
+	new_i_pos = MSDOS_I(new_inode)->i_pos;
+
+	vfat_exchange_ipos(old_inode, new_inode, old_i_pos, new_i_pos);
+
+	err = vfat_sync_ipos(old_dir, new_inode);
+	if (err)
+		goto error_exchange;
+	err = vfat_sync_ipos(new_dir, old_inode);
+	if (err)
+		goto error_exchange;
+
+	/* update ".." directory entry info */
+	if (old_dotdot_de) {
+		err = vfat_update_dotdot_de(new_dir, old_inode, old_dotdot_bh,
+					    old_dotdot_de);
+		if (err)
+			goto error_old_dotdot;
+	}
+	if (new_dotdot_de) {
+		err = vfat_update_dotdot_de(old_dir, new_inode, new_dotdot_bh,
+					    new_dotdot_de);
+		if (err)
+			goto error_new_dotdot;
+	}
+
+	/* if cross directory and only one is a directory, adjust nlink */
+	if (!old_dotdot_de != !new_dotdot_de) {
+		if (old_dotdot_de)
+			vfat_move_nlink(old_dir, new_dir);
+		else
+			vfat_move_nlink(new_dir, old_dir);
+	}
+
+	vfat_update_dir_metadata(old_dir, &ts);
+	/* if directories are not the same, update new_dir as well */
+	if (old_dir != new_dir)
+		vfat_update_dir_metadata(new_dir, &ts);
+
+out:
+	brelse(old_dotdot_bh);
+	brelse(new_dotdot_bh);
+	mutex_unlock(&MSDOS_SB(sb)->s_lock);
+
+	return err;
+
+error_new_dotdot:
+	if (new_dotdot_de) {
+		corrupt |= vfat_update_dotdot_de(new_dir, new_inode,
+						 new_dotdot_bh, new_dotdot_de);
+	}
+
+error_old_dotdot:
+	if (old_dotdot_de) {
+		corrupt |= vfat_update_dotdot_de(old_dir, old_inode,
+						 old_dotdot_bh, old_dotdot_de);
+	}
+
+error_exchange:
+	vfat_exchange_ipos(old_inode, new_inode, new_i_pos, old_i_pos);
+	corrupt |= vfat_sync_ipos(new_dir, new_inode);
+	corrupt |= vfat_sync_ipos(old_dir, old_inode);
+
+	if (corrupt < 0) {
+		fat_fs_error(new_dir->i_sb,
+			     "%s: Filesystem corrupted (i_pos %lld, %lld)",
+			     __func__, old_i_pos, new_i_pos);
+	}
+	goto out;
+}
+
 static int vfat_rename2(struct user_namespace *mnt_userns, struct inode *old_dir,
 			struct dentry *old_dentry, struct inode *new_dir,
 			struct dentry *new_dentry, unsigned int flags)
 {
-	if (flags & ~RENAME_NOREPLACE)
+	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
 		return -EINVAL;
 
+	if (flags & RENAME_EXCHANGE) {
+		return vfat_rename_exchange(old_dir, old_dentry,
+					    new_dir, new_dentry);
+	}
+
 	/* VFS already handled RENAME_NOREPLACE, handle it as a normal rename */
 	return vfat_rename(old_dir, old_dentry, new_dir, new_dentry);
 }
-- 
cgit v1.2.3


From 376b0c266143a1dda162db6d5bc9b3a7f0ae97c9 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 15 Jun 2022 14:22:06 +0300
Subject: proc: delete unused <linux/uaccess.h> includes

Those aren't necessary after seq files won.

Link: https://lkml.kernel.org/r/YqnA3mS7KBt8Z4If@localhost.localdomain
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/array.c    | 1 -
 fs/proc/inode.c    | 2 --
 fs/proc/kmsg.c     | 1 -
 fs/proc/nommu.c    | 1 -
 fs/proc/proc_net.c | 3 ---
 fs/proc/proc_tty.c | 2 --
 fs/proc/root.c     | 3 ---
 fs/proc/vmcore.c   | 1 -
 8 files changed, 14 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index eb815759842c..65fa603422e0 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -69,7 +69,6 @@
 #include <linux/sched/cputime.h>
 #include <linux/proc_fs.h>
 #include <linux/ioport.h>
-#include <linux/uaccess.h>
 #include <linux/io.h>
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 73aeb4e6d32e..fd40d60169b5 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -26,8 +26,6 @@
 #include <linux/mount.h>
 #include <linux/bug.h>
 
-#include <linux/uaccess.h>
-
 #include "internal.h"
 
 static void proc_evict_inode(struct inode *inode)
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index b38ad552887f..592e6dc7c110 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -15,7 +15,6 @@
 #include <linux/fs.h>
 #include <linux/syslog.h>
 
-#include <linux/uaccess.h>
 #include <asm/io.h>
 
 extern wait_queue_head_t log_wait;
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 13452b32e2bd..4d3493579458 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -21,7 +21,6 @@
 #include <linux/seq_file.h>
 #include <linux/hugetlb.h>
 #include <linux/vmalloc.h>
-#include <linux/uaccess.h>
 #include <asm/tlb.h>
 #include <asm/div64.h>
 #include "internal.h"
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 913e5acefbb6..bbce6fbe779c 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -8,9 +8,6 @@
  *
  *  proc net directory handling functions
  */
-
-#include <linux/uaccess.h>
-
 #include <linux/errno.h>
 #include <linux/time.h>
 #include <linux/proc_fs.h>
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index c69ff191e5d8..5c6a5ceab2f1 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -4,8 +4,6 @@
  *
  * Copyright 1997, Theodore Ts'o
  */
-
-#include <linux/uaccess.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/errno.h>
diff --git a/fs/proc/root.c b/fs/proc/root.c
index c7e3b1350ef8..5a7d15d197f8 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -6,9 +6,6 @@
  *
  *  proc root directory handling functions
  */
-
-#include <linux/uaccess.h>
-
 #include <linux/errno.h>
 #include <linux/time.h>
 #include <linux/proc_fs.h>
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 4eaeb645e759..f2aa86c421f2 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -25,7 +25,6 @@
 #include <linux/mutex.h>
 #include <linux/vmalloc.h>
 #include <linux/pagemap.h>
-#include <linux/uaccess.h>
 #include <linux/uio.h>
 #include <linux/cc_platform.h>
 #include <asm/io.h>
-- 
cgit v1.2.3


From a16ceb13961068f7209e34d7984f8e42d2c06159 Mon Sep 17 00:00:00 2001
From: Benjamin Segall <bsegall@google.com>
Date: Wed, 15 Jun 2022 14:24:23 -0700
Subject: epoll: autoremove wakers even more aggressively

If a process is killed or otherwise exits while having active network
connections and many threads waiting on epoll_wait, the threads will all
be woken immediately, but not removed from ep->wq.  Then when network
traffic scans ep->wq in wake_up, every wakeup attempt will fail, and will
not remove the entries from the list.

This means that the cost of the wakeup attempt is far higher than usual,
does not decrease, and this also competes with the dying threads trying to
actually make progress and remove themselves from the wq.

Handle this by removing visited epoll wq entries unconditionally, rather
than only when the wakeup succeeds - the structure of ep_poll means that
the only potential loss is the timed_out->eavail heuristic, which now can
race and result in a redundant ep_send_events attempt.  (But only when
incoming data and a timeout actually race, not on every timeout)

Shakeel added:

: We are seeing this issue in production with real workloads and it has
: caused hard lockups.  Particularly network heavy workloads with a lot
: of threads in epoll_wait() can easily trigger this issue if they get
: killed (oom-killed in our case).

Link: https://lkml.kernel.org/r/xm26fsjotqda.fsf@google.com
Signed-off-by: Ben Segall <bsegall@google.com>
Tested-by: Shakeel Butt <shakeelb@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Roman Penyaev <rpenyaev@suse.de>
Cc: Jason Baron <jbaron@akamai.com>
Cc: Khazhismel Kumykov <khazhy@google.com>
Cc: Heiher <r@hev.cc>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/eventpoll.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index e2daa940ebce..8b56b94e2f56 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1747,6 +1747,21 @@ static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
 	return to;
 }
 
+/*
+ * autoremove_wake_function, but remove even on failure to wake up, because we
+ * know that default_wake_function/ttwu will only fail if the thread is already
+ * woken, and in that case the ep_poll loop will remove the entry anyways, not
+ * try to reuse it.
+ */
+static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
+				       unsigned int mode, int sync, void *key)
+{
+	int ret = default_wake_function(wq_entry, mode, sync, key);
+
+	list_del_init(&wq_entry->entry);
+	return ret;
+}
+
 /**
  * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
  *           event buffer.
@@ -1828,8 +1843,15 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		 * normal wakeup path no need to call __remove_wait_queue()
 		 * explicitly, thus ep->lock is not taken, which halts the
 		 * event delivery.
+		 *
+		 * In fact, we now use an even more aggressive function that
+		 * unconditionally removes, because we don't reuse the wait
+		 * entry between loop iterations. This lets us also avoid the
+		 * performance issue if a process is killed, causing all of its
+		 * threads to wake up without being removed normally.
 		 */
 		init_wait(&wait);
+		wait.func = ep_autoremove_wake_function;
 
 		write_lock_irq(&ep->lock);
 		/*
-- 
cgit v1.2.3


From f71381fcdc3ab615f55278d435a9f35542dc9e63 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Fri, 8 Jul 2022 09:43:01 +0800
Subject: autofs: use inode permission method for write access

Patch series "autofs: misc patches".

This series contains several patches that resulted mostly from comments
made by Al Viro (quite a long time ago now).


This patch (of 5):

Eliminate some code duplication from mkdir/rmdir/symlink/unlink methods by
using the inode operation .permission().

Link: https://lkml.kernel.org/r/165724445154.30914.10970894936827635879.stgit@donald.themaw.net
Link: https://lkml.kernel.org/r/165724458096.30914.13499431569758625806.stgit@donald.themaw.net
Signed-off-by: Ian Kent <raven@themaw.net>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: David Howells <dhowells@redhat.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/autofs/root.c | 63 ++++++++++++++++++++------------------------------------
 1 file changed, 22 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 91fe4548c256..fef6ed991022 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -10,6 +10,7 @@
 
 #include "autofs_i.h"
 
+static int autofs_dir_permission(struct user_namespace *, struct inode *, int);
 static int autofs_dir_symlink(struct user_namespace *, struct inode *,
 			      struct dentry *, const char *);
 static int autofs_dir_unlink(struct inode *, struct dentry *);
@@ -50,6 +51,7 @@ const struct file_operations autofs_dir_operations = {
 
 const struct inode_operations autofs_dir_inode_operations = {
 	.lookup		= autofs_lookup,
+	.permission	= autofs_dir_permission,
 	.unlink		= autofs_dir_unlink,
 	.symlink	= autofs_dir_symlink,
 	.mkdir		= autofs_dir_mkdir,
@@ -526,11 +528,30 @@ static struct dentry *autofs_lookup(struct inode *dir,
 	return NULL;
 }
 
+static int autofs_dir_permission(struct user_namespace *mnt_userns,
+				 struct inode *inode, int mask)
+{
+	if (mask & MAY_WRITE) {
+		struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb);
+
+		if (!autofs_oz_mode(sbi))
+			return -EACCES;
+
+		/* autofs_oz_mode() needs to allow path walks when the
+		 * autofs mount is catatonic but the state of an autofs
+		 * file system needs to be preserved over restarts.
+		 */
+		if (sbi->flags & AUTOFS_SBI_CATATONIC)
+			return -EACCES;
+	}
+
+	return generic_permission(mnt_userns, inode, mask);
+}
+
 static int autofs_dir_symlink(struct user_namespace *mnt_userns,
 			      struct inode *dir, struct dentry *dentry,
 			      const char *symname)
 {
-	struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
 	struct autofs_info *ino = autofs_dentry_ino(dentry);
 	struct autofs_info *p_ino;
 	struct inode *inode;
@@ -539,16 +560,6 @@ static int autofs_dir_symlink(struct user_namespace *mnt_userns,
 
 	pr_debug("%s <- %pd\n", symname, dentry);
 
-	if (!autofs_oz_mode(sbi))
-		return -EACCES;
-
-	/* autofs_oz_mode() needs to allow path walks when the
-	 * autofs mount is catatonic but the state of an autofs
-	 * file system needs to be preserved over restarts.
-	 */
-	if (sbi->flags & AUTOFS_SBI_CATATONIC)
-		return -EACCES;
-
 	BUG_ON(!ino);
 
 	autofs_clean_ino(ino);
@@ -601,16 +612,6 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry)
 	struct autofs_info *ino = autofs_dentry_ino(dentry);
 	struct autofs_info *p_ino;
 
-	if (!autofs_oz_mode(sbi))
-		return -EACCES;
-
-	/* autofs_oz_mode() needs to allow path walks when the
-	 * autofs mount is catatonic but the state of an autofs
-	 * file system needs to be preserved over restarts.
-	 */
-	if (sbi->flags & AUTOFS_SBI_CATATONIC)
-		return -EACCES;
-
 	ino->count--;
 	p_ino = autofs_dentry_ino(dentry->d_parent);
 	p_ino->count--;
@@ -683,16 +684,6 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry)
 
 	pr_debug("dentry %p, removing %pd\n", dentry, dentry);
 
-	if (!autofs_oz_mode(sbi))
-		return -EACCES;
-
-	/* autofs_oz_mode() needs to allow path walks when the
-	 * autofs mount is catatonic but the state of an autofs
-	 * file system needs to be preserved over restarts.
-	 */
-	if (sbi->flags & AUTOFS_SBI_CATATONIC)
-		return -EACCES;
-
 	if (ino->count != 1)
 		return -ENOTEMPTY;
 
@@ -726,16 +717,6 @@ static int autofs_dir_mkdir(struct user_namespace *mnt_userns,
 	struct autofs_info *p_ino;
 	struct inode *inode;
 
-	if (!autofs_oz_mode(sbi))
-		return -EACCES;
-
-	/* autofs_oz_mode() needs to allow path walks when the
-	 * autofs mount is catatonic but the state of an autofs
-	 * file system needs to be preserved over restarts.
-	 */
-	if (sbi->flags & AUTOFS_SBI_CATATONIC)
-		return -EACCES;
-
 	pr_debug("dentry %p, creating %pd\n", dentry, dentry);
 
 	BUG_ON(!ino);
-- 
cgit v1.2.3


From 9ccbac76e71de411b9c4beea9d91ba98f3fad690 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Fri, 8 Jul 2022 09:43:06 +0800
Subject: autofs: make dentry info count consistent

If an autofs dentry is a mount root directory there's no ->mkdir() call to
set its count to one.

To make the dentry info count consistent for all autofs dentries set count
to one when the dentry info struct is allocated.

Link: https://lkml.kernel.org/r/165724458671.30914.2902424437132835325.stgit@donald.themaw.net
Signed-off-by: Ian Kent <raven@themaw.net>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: David Howells <dhowells@redhat.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/autofs/inode.c | 1 +
 fs/autofs/root.c  | 4 ----
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 9edf243713eb..affa70360b1f 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -20,6 +20,7 @@ struct autofs_info *autofs_new_ino(struct autofs_sb_info *sbi)
 		INIT_LIST_HEAD(&ino->expiring);
 		ino->last_used = jiffies;
 		ino->sbi = sbi;
+		ino->count = 1;
 	}
 	return ino;
 }
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index fef6ed991022..442d27d9cb1b 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -582,7 +582,6 @@ static int autofs_dir_symlink(struct user_namespace *mnt_userns,
 	d_add(dentry, inode);
 
 	dget(dentry);
-	ino->count++;
 	p_ino = autofs_dentry_ino(dentry->d_parent);
 	p_ino->count++;
 
@@ -612,7 +611,6 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry)
 	struct autofs_info *ino = autofs_dentry_ino(dentry);
 	struct autofs_info *p_ino;
 
-	ino->count--;
 	p_ino = autofs_dentry_ino(dentry->d_parent);
 	p_ino->count--;
 	dput(ino->dentry);
@@ -695,7 +693,6 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry)
 	if (sbi->version < 5)
 		autofs_clear_leaf_automount_flags(dentry);
 
-	ino->count--;
 	p_ino = autofs_dentry_ino(dentry->d_parent);
 	p_ino->count--;
 	dput(ino->dentry);
@@ -734,7 +731,6 @@ static int autofs_dir_mkdir(struct user_namespace *mnt_userns,
 		autofs_set_leaf_automount_flags(dentry);
 
 	dget(dentry);
-	ino->count++;
 	p_ino = autofs_dentry_ino(dentry->d_parent);
 	p_ino->count++;
 	inc_nlink(dir);
-- 
cgit v1.2.3


From a4a87303874c1a7d49cc18a8fe33676b0002ffbf Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Fri, 8 Jul 2022 09:43:12 +0800
Subject: autofs: use dentry info count instead of simple_empty()

The dentry info.  field count is used to check if a dentry is in use
during expire.  But, to be used for this the count field must account for
the presence of child dentries in a directory dentry.

Therefore it can also be used to check for an empty directory dentry which
can be done without having to to take an additional lock or account for
the presence of a readdir cursor dentry as is done by simple_empty().

Link: https://lkml.kernel.org/r/165724459238.30914.1504611159945950108.stgit@donald.themaw.net
Signed-off-by: Ian Kent <raven@themaw.net>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: David Howells <dhowells@redhat.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/autofs/autofs_i.h |  5 +++++
 fs/autofs/expire.c   |  2 +-
 fs/autofs/root.c     | 18 ++++++++----------
 3 files changed, 14 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 918826eaceea..0117d6e06300 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -148,6 +148,11 @@ static inline int autofs_oz_mode(struct autofs_sb_info *sbi)
 		 task_pgrp(current) == sbi->oz_pgrp);
 }
 
+static inline bool autofs_empty(struct autofs_info *ino)
+{
+	return ino->count < 2;
+}
+
 struct inode *autofs_get_inode(struct super_block *, umode_t);
 void autofs_free_ino(struct autofs_info *);
 
diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index b3fefd6237c3..038b3d2d9f57 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -371,7 +371,7 @@ static struct dentry *should_expire(struct dentry *dentry,
 		return NULL;
 	}
 
-	if (simple_empty(dentry))
+	if (autofs_empty(ino))
 		return NULL;
 
 	/* Case 2: tree mount, expire iff entire tree is not busy */
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 442d27d9cb1b..e0fa71eb5c05 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -79,6 +79,7 @@ static int autofs_dir_open(struct inode *inode, struct file *file)
 {
 	struct dentry *dentry = file->f_path.dentry;
 	struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs_dentry_ino(dentry);
 
 	pr_debug("file=%p dentry=%p %pd\n", file, dentry, dentry);
 
@@ -95,7 +96,7 @@ static int autofs_dir_open(struct inode *inode, struct file *file)
 	 * it.
 	 */
 	spin_lock(&sbi->lookup_lock);
-	if (!path_is_mountpoint(&file->f_path) && simple_empty(dentry)) {
+	if (!path_is_mountpoint(&file->f_path) && autofs_empty(ino)) {
 		spin_unlock(&sbi->lookup_lock);
 		return -ENOENT;
 	}
@@ -364,7 +365,7 @@ static struct vfsmount *autofs_d_automount(struct path *path)
 		 * the mount never trigger mounts themselves (they have an
 		 * autofs trigger mount mounted on them). But v4 pseudo direct
 		 * mounts do need the leaves to trigger mounts. In this case
-		 * we have no choice but to use the list_empty() check and
+		 * we have no choice but to use the autofs_empty() check and
 		 * require user space behave.
 		 */
 		if (sbi->version > 4) {
@@ -373,7 +374,7 @@ static struct vfsmount *autofs_d_automount(struct path *path)
 				goto done;
 			}
 		} else {
-			if (!simple_empty(dentry)) {
+			if (!autofs_empty(ino)) {
 				spin_unlock(&sbi->fs_lock);
 				goto done;
 			}
@@ -428,9 +429,8 @@ static int autofs_d_manage(const struct path *path, bool rcu_walk)
 
 	if (rcu_walk) {
 		/* We don't need fs_lock in rcu_walk mode,
-		 * just testing 'AUTOFS_INFO_NO_RCU' is enough.
-		 * simple_empty() takes a spinlock, so leave it
-		 * to last.
+		 * just testing 'AUTOFS_INF_WANT_EXPIRE' is enough.
+		 *
 		 * We only return -EISDIR when certain this isn't
 		 * a mount-trap.
 		 */
@@ -443,9 +443,7 @@ static int autofs_d_manage(const struct path *path, bool rcu_walk)
 		inode = d_inode_rcu(dentry);
 		if (inode && S_ISLNK(inode->i_mode))
 			return -EISDIR;
-		if (list_empty(&dentry->d_subdirs))
-			return 0;
-		if (!simple_empty(dentry))
+		if (!autofs_empty(ino))
 			return -EISDIR;
 		return 0;
 	}
@@ -465,7 +463,7 @@ static int autofs_d_manage(const struct path *path, bool rcu_walk)
 		 * we can avoid needless calls ->d_automount() and avoid
 		 * an incorrect ELOOP error return.
 		 */
-		if ((!path_is_mountpoint(path) && !simple_empty(dentry)) ||
+		if ((!path_is_mountpoint(path) && !autofs_empty(ino)) ||
 		    (d_really_is_positive(dentry) && d_is_symlink(dentry)))
 			status = -EISDIR;
 	}
-- 
cgit v1.2.3


From ba97a0a3a31a2451607ebf601c0b7c4b1322ce9a Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Fri, 8 Jul 2022 09:43:18 +0800
Subject: autofs: add comment about autofs_mountpoint_changed()

The function autofs_mountpoint_changed() is unusual, add a comment about
two cases for which it is needed.

Link: https://lkml.kernel.org/r/165724459804.30914.10974834416046555127.stgit@donald.themaw.net
Signed-off-by: Ian Kent <raven@themaw.net>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: David Howells <dhowells@redhat.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/autofs/root.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index e0fa71eb5c05..ca03c1cae2be 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -291,9 +291,26 @@ static struct dentry *autofs_mountpoint_changed(struct path *path)
 	struct dentry *dentry = path->dentry;
 	struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
 
-	/*
-	 * If this is an indirect mount the dentry could have gone away
-	 * as a result of an expire and a new one created.
+	/* If this is an indirect mount the dentry could have gone away
+	 * and a new one created.
+	 *
+	 * This is unusual and I can't remember the case for which it
+	 * was originally added now. But an example of how this can
+	 * happen is an autofs indirect mount that has the "browse"
+	 * option set and also has the "symlink" option in the autofs
+	 * map entry. In this case the daemon will remove the browse
+	 * directory and create a symlink as the mount leaving the
+	 * struct path stale.
+	 *
+	 * Another not so obvious case is when a mount in an autofs
+	 * indirect mount that uses the "nobrowse" option is being
+	 * expired at the same time as a path walk. If the mount has
+	 * been umounted but the mount point directory seen before
+	 * becoming unhashed (during a lockless path walk) when a stat
+	 * family system call is made the mount won't be re-mounted as
+	 * it should. In this case the mount point that's been removed
+	 * (by the daemon) will be stale and the a new mount point
+	 * dentry created.
 	 */
 	if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
 		struct dentry *parent = dentry->d_parent;
-- 
cgit v1.2.3


From 7ffe4e90a061a2f612b3b8c29b583ec3b707781f Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Fri, 8 Jul 2022 09:43:23 +0800
Subject: autofs: remove unused ino field inode

Remove the unused inode field of the autofs dentry info structure.

Link: https://lkml.kernel.org/r/165724460393.30914.6511330213821246793.stgit@donald.themaw.net
Signed-off-by: Ian Kent <raven@themaw.net>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: David Howells <dhowells@redhat.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/autofs/autofs_i.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 0117d6e06300..d5a44fa88acf 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -51,8 +51,6 @@ extern struct file_system_type autofs_fs_type;
  */
 struct autofs_info {
 	struct dentry	*dentry;
-	struct inode	*inode;
-
 	int		flags;
 
 	struct completion expire_complete;
-- 
cgit v1.2.3


From d919a1e79bac890421537cf02ae773007bf55e6b Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1@huawei.com>
Date: Wed, 13 Jul 2022 21:00:29 +0800
Subject: proc: fix a dentry lock race between release_task and lookup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 7bc3e6e55acf06 ("proc: Use a list of inodes to flush from proc")
moved proc_flush_task() behind __exit_signal().  Then, process systemd can
take long period high cpu usage during releasing task in following
concurrent processes:

  systemd                                 ps
kernel_waitid                 stat(/proc/tgid)
  do_wait                       filename_lookup
    wait_consider_task            lookup_fast
      release_task
        __exit_signal
          __unhash_process
            detach_pid
              __change_pid // remove task->pid_links
                                     d_revalidate -> pid_revalidate  // 0
                                     d_invalidate(/proc/tgid)
                                       shrink_dcache_parent(/proc/tgid)
                                         d_walk(/proc/tgid)
                                           spin_lock_nested(/proc/tgid/fd)
                                           // iterating opened fd
        proc_flush_pid                                    |
           d_invalidate (/proc/tgid/fd)                   |
              shrink_dcache_parent(/proc/tgid/fd)         |
                shrink_dentry_list(subdirs)               ↓
                  shrink_lock_dentry(/proc/tgid/fd) --> race on dentry lock

Function d_invalidate() will remove dentry from hash firstly, but why does
proc_flush_pid() process dentry '/proc/tgid/fd' before dentry
'/proc/tgid'?  That's because proc_pid_make_inode() adds proc inode in
reverse order by invoking hlist_add_head_rcu().  But proc should not add
any inodes under '/proc/tgid' except '/proc/tgid/task/pid', fix it by
adding inode into 'pid->inodes' only if the inode is /proc/tgid or
/proc/tgid/task/pid.

Performance regression:
Create 200 tasks, each task open one file for 50,000 times. Kill all
tasks when opened files exceed 10,000,000 (cat /proc/sys/fs/file-nr).

Before fix:
$ time killall -wq aa
  real    4m40.946s   # During this period, we can see 'ps' and 'systemd'
			taking high cpu usage.

After fix:
$ time killall -wq aa
  real    1m20.732s   # During this period, we can see 'systemd' taking
			high cpu usage.

Link: https://lkml.kernel.org/r/20220713130029.4133533-1-chengzhihao1@huawei.com
Fixes: 7bc3e6e55acf06 ("proc: Use a list of inodes to flush from proc")
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216054
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Suggested-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/base.c | 46 ++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 38 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8dfa36a99c74..93f7e3d971e4 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1885,7 +1885,7 @@ void proc_pid_evict_inode(struct proc_inode *ei)
 	put_pid(pid);
 }
 
-struct inode *proc_pid_make_inode(struct super_block * sb,
+struct inode *proc_pid_make_inode(struct super_block *sb,
 				  struct task_struct *task, umode_t mode)
 {
 	struct inode * inode;
@@ -1914,11 +1914,6 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
 
 	/* Let the pid remember us for quick removal */
 	ei->pid = pid;
-	if (S_ISDIR(mode)) {
-		spin_lock(&pid->lock);
-		hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
-		spin_unlock(&pid->lock);
-	}
 
 	task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
 	security_task_to_inode(task, inode);
@@ -1931,6 +1926,39 @@ out_unlock:
 	return NULL;
 }
 
+/*
+ * Generating an inode and adding it into @pid->inodes, so that task will
+ * invalidate inode's dentry before being released.
+ *
+ * This helper is used for creating dir-type entries under '/proc' and
+ * '/proc/<tgid>/task'. Other entries(eg. fd, stat) under '/proc/<tgid>'
+ * can be released by invalidating '/proc/<tgid>' dentry.
+ * In theory, dentries under '/proc/<tgid>/task' can also be released by
+ * invalidating '/proc/<tgid>' dentry, we reserve it to handle single
+ * thread exiting situation: Any one of threads should invalidate its
+ * '/proc/<tgid>/task/<pid>' dentry before released.
+ */
+static struct inode *proc_pid_make_base_inode(struct super_block *sb,
+				struct task_struct *task, umode_t mode)
+{
+	struct inode *inode;
+	struct proc_inode *ei;
+	struct pid *pid;
+
+	inode = proc_pid_make_inode(sb, task, mode);
+	if (!inode)
+		return NULL;
+
+	/* Let proc_flush_pid find this directory inode */
+	ei = PROC_I(inode);
+	pid = ei->pid;
+	spin_lock(&pid->lock);
+	hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
+	spin_unlock(&pid->lock);
+
+	return inode;
+}
+
 int pid_getattr(struct user_namespace *mnt_userns, const struct path *path,
 		struct kstat *stat, u32 request_mask, unsigned int query_flags)
 {
@@ -3369,7 +3397,8 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry,
 {
 	struct inode *inode;
 
-	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
+	inode = proc_pid_make_base_inode(dentry->d_sb, task,
+					 S_IFDIR | S_IRUGO | S_IXUGO);
 	if (!inode)
 		return ERR_PTR(-ENOENT);
 
@@ -3671,7 +3700,8 @@ static struct dentry *proc_task_instantiate(struct dentry *dentry,
 	struct task_struct *task, const void *ptr)
 {
 	struct inode *inode;
-	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
+	inode = proc_pid_make_base_inode(dentry->d_sb, task,
+					 S_IFDIR | S_IRUGO | S_IXUGO);
 	if (!inode)
 		return ERR_PTR(-ENOENT);
 
-- 
cgit v1.2.3


From 233eb8d6894ed3349e9971a51dd8a9b5586e2598 Mon Sep 17 00:00:00 2001
From: Jiangshan Yi <yijiangshan@kylinos.cn>
Date: Fri, 15 Jul 2022 14:00:35 +0800
Subject: fs/ocfs2: Fix spelling typo in comment

Fix spelling typo in comment.

Link: https://lkml.kernel.org/r/20220715060035.632903-1-13667453960@163.com
Signed-off-by: Jiangshan Yi <yijiangshan@kylinos.cn>
Reported-by: k2ci <kernel-bot@kylinos.cn>
Acked-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/ocfs2/quota_global.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 0b6f551a342a..dc9f76ab7e13 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -412,7 +412,7 @@ out_unlock:
 	goto out_err;
 }
 
-/* Write information to global quota file. Expects exlusive lock on quota
+/* Write information to global quota file. Expects exclusive lock on quota
  * file inode and quota info */
 static int __ocfs2_global_write_info(struct super_block *sb, int type)
 {
-- 
cgit v1.2.3


From 0c12185728d602c27cd12a845249e7f37197f71f Mon Sep 17 00:00:00 2001
From: Hsin-Yi Wang <hsinyi@chromium.org>
Date: Fri, 17 Jun 2022 16:38:09 +0800
Subject: Revert "squashfs: provide backing_dev_info in order to disable
 read-ahead"

Patch series "Implement readahead for squashfs", v7.

Commit 9eec1d897139("squashfs: provide backing_dev_info in order to
disable read-ahead") mitigates the performance drop issue for squashfs by
closing readahead for it.

This series implements readahead callback for squashfs.


This patch (of 4):

This reverts 9eec1d897139e5 ("squashfs: provide backing_dev_info in order
to disable read-ahead").

Revert closing the readahead to squashfs since the readahead callback for
squashfs is implemented.

Link: https://lkml.kernel.org/r/20220617083810.337573-1-hsinyi@chromium.org
Link: https://lkml.kernel.org/r/20220617083810.337573-2-hsinyi@chromium.org
Signed-off-by: Hsin-Yi Wang <hsinyi@chromium.org>
Suggested-by: Xiongwei Song <Xiongwei.Song@windriver.com>
Cc: Phillip Lougher <phillip@squashfs.org.uk>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Zheng Liang <zhengliang6@huawei.com>
Cc: Zhang Yi <yi.zhang@huawei.com>
Cc: Hou Tao <houtao1@huawei.com>
Cc: Miao Xie <miaoxie@huawei.com>

Cc: kernel test robot <lkp@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/squashfs/super.c | 33 ---------------------------------
 1 file changed, 33 deletions(-)

(limited to 'fs')

diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 6d594ba2ed28..32565dafa7f3 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -29,7 +29,6 @@
 #include <linux/module.h>
 #include <linux/magic.h>
 #include <linux/xattr.h>
-#include <linux/backing-dev.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
@@ -113,24 +112,6 @@ static const struct squashfs_decompressor *supported_squashfs_filesystem(
 	return decompressor;
 }
 
-static int squashfs_bdi_init(struct super_block *sb)
-{
-	int err;
-	unsigned int major = MAJOR(sb->s_dev);
-	unsigned int minor = MINOR(sb->s_dev);
-
-	bdi_put(sb->s_bdi);
-	sb->s_bdi = &noop_backing_dev_info;
-
-	err = super_setup_bdi_name(sb, "squashfs_%u_%u", major, minor);
-	if (err)
-		return err;
-
-	sb->s_bdi->ra_pages = 0;
-	sb->s_bdi->io_pages = 0;
-
-	return 0;
-}
 
 static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
 {
@@ -146,20 +127,6 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
 
 	TRACE("Entered squashfs_fill_superblock\n");
 
-	/*
-	 * squashfs provides 'backing_dev_info' in order to disable read-ahead. For
-	 * squashfs, I/O is not deferred, it is done immediately in read_folio,
-	 * which means the user would always have to wait their own I/O. So the effect
-	 * of readahead is very weak for squashfs. squashfs_bdi_init will set
-	 * sb->s_bdi->ra_pages and sb->s_bdi->io_pages to 0 and close readahead for
-	 * squashfs.
-	 */
-	err = squashfs_bdi_init(sb);
-	if (err) {
-		errorf(fc, "squashfs init bdi failed");
-		return err;
-	}
-
 	sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL);
 	if (sb->s_fs_info == NULL) {
 		ERROR("Failed to allocate squashfs_sb_info\n");
-- 
cgit v1.2.3


From db98b43086275350294f5c6f797249b714d6316d Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@squashfs.org.uk>
Date: Fri, 17 Jun 2022 16:38:11 +0800
Subject: squashfs: always build "file direct" version of page actor

Squashfs_readahead uses the "file direct" version of the page actor, and
so build it unconditionally.

Link: https://lkml.kernel.org/r/20220617083810.337573-3-hsinyi@chromium.org
Signed-off-by: Phillip Lougher <phillip@squashfs.org.uk>
Signed-off-by: Hsin-Yi Wang <hsinyi@chromium.org>
Reported-by: kernel test robot <lkp@intel.com>
Cc: Hou Tao <houtao1@huawei.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miao Xie <miaoxie@huawei.com>
Cc: Xiongwei Song <Xiongwei.Song@windriver.com>
Cc: Zhang Yi <yi.zhang@huawei.com>
Cc: Zheng Liang <zhengliang6@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/squashfs/Makefile     |  4 ++--
 fs/squashfs/page_actor.h | 46 ----------------------------------------------
 2 files changed, 2 insertions(+), 48 deletions(-)

(limited to 'fs')

diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 7bd9b8b856d0..477c89a519ee 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -5,9 +5,9 @@
 
 obj-$(CONFIG_SQUASHFS) += squashfs.o
 squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
-squashfs-y += namei.o super.o symlink.o decompressor.o
+squashfs-y += namei.o super.o symlink.o decompressor.o page_actor.o
 squashfs-$(CONFIG_SQUASHFS_FILE_CACHE) += file_cache.o
-squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o page_actor.o
+squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o
 squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o
 squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o
 squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o
diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h
index 37523c54256f..24841d28bc0f 100644
--- a/fs/squashfs/page_actor.h
+++ b/fs/squashfs/page_actor.h
@@ -6,51 +6,6 @@
  * Phillip Lougher <phillip@squashfs.org.uk>
  */
 
-#ifndef CONFIG_SQUASHFS_FILE_DIRECT
-struct squashfs_page_actor {
-	void	**page;
-	int	pages;
-	int	length;
-	int	next_page;
-};
-
-static inline struct squashfs_page_actor *squashfs_page_actor_init(void **page,
-	int pages, int length)
-{
-	struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
-
-	if (actor == NULL)
-		return NULL;
-
-	actor->length = length ? : pages * PAGE_SIZE;
-	actor->page = page;
-	actor->pages = pages;
-	actor->next_page = 0;
-	return actor;
-}
-
-static inline void *squashfs_first_page(struct squashfs_page_actor *actor)
-{
-	actor->next_page = 1;
-	return actor->page[0];
-}
-
-static inline void *squashfs_next_page(struct squashfs_page_actor *actor)
-{
-	return actor->next_page == actor->pages ? NULL :
-		actor->page[actor->next_page++];
-}
-
-static inline void squashfs_finish_page(struct squashfs_page_actor *actor)
-{
-	/* empty */
-}
-
-static inline void squashfs_actor_nobuff(struct squashfs_page_actor *actor)
-{
-	/* empty */
-}
-#else
 struct squashfs_page_actor {
 	union {
 		void		**buffer;
@@ -91,4 +46,3 @@ static inline void squashfs_actor_nobuff(struct squashfs_page_actor *actor)
 	actor->alloc_buffer = 0;
 }
 #endif
-#endif
-- 
cgit v1.2.3


From 8fc78b6fe24c36b151ac98d7546591ed92083d4f Mon Sep 17 00:00:00 2001
From: Hsin-Yi Wang <hsinyi@chromium.org>
Date: Fri, 17 Jun 2022 16:38:13 +0800
Subject: squashfs: implement readahead

Implement readahead callback for squashfs.  It will read datablocks which
cover pages in readahead request.  For a few cases it will not mark page
as uptodate, including:

- file end is 0.
- zero filled blocks.
- current batch of pages isn't in the same datablock.
- decompressor error.

Otherwise pages will be marked as uptodate.  The unhandled pages will be
updated by readpage later.

Link: https://lkml.kernel.org/r/20220617083810.337573-4-hsinyi@chromium.org
Signed-off-by: Hsin-Yi Wang <hsinyi@chromium.org>
Suggested-by: Matthew Wilcox <willy@infradead.org>
Reported-by: Matthew Wilcox <willy@infradead.org>
Reported-by: Phillip Lougher <phillip@squashfs.org.uk>
Reported-by: Xiongwei Song <Xiongwei.Song@windriver.com>
Reported-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Hou Tao <houtao1@huawei.com>
Cc: kernel test robot <lkp@intel.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Miao Xie <miaoxie@huawei.com>
Cc: Zhang Yi <yi.zhang@huawei.com>
Cc: Zheng Liang <zhengliang6@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/squashfs/file.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 91 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index a8e495d8eb86..128ebe9aded8 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -39,6 +39,7 @@
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
+#include "page_actor.h"
 
 /*
  * Locate cache slot in range [offset, index] for specified inode.  If
@@ -495,7 +496,96 @@ out:
 	return 0;
 }
 
+static void squashfs_readahead(struct readahead_control *ractl)
+{
+	struct inode *inode = ractl->mapping->host;
+	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+	size_t mask = (1UL << msblk->block_log) - 1;
+	unsigned short shift = msblk->block_log - PAGE_SHIFT;
+	loff_t start = readahead_pos(ractl) & ~mask;
+	size_t len = readahead_length(ractl) + readahead_pos(ractl) - start;
+	struct squashfs_page_actor *actor;
+	unsigned int nr_pages = 0;
+	struct page **pages;
+	int i, file_end = i_size_read(inode) >> msblk->block_log;
+	unsigned int max_pages = 1UL << shift;
+
+	readahead_expand(ractl, start, (len | mask) + 1);
+
+	if (file_end == 0)
+		return;
+
+	pages = kmalloc_array(max_pages, sizeof(void *), GFP_KERNEL);
+	if (!pages)
+		return;
+
+	for (;;) {
+		pgoff_t index;
+		int res, bsize;
+		u64 block = 0;
+		unsigned int expected;
+
+		nr_pages = __readahead_batch(ractl, pages, max_pages);
+		if (!nr_pages)
+			break;
+
+		if (readahead_pos(ractl) >= i_size_read(inode))
+			goto skip_pages;
+
+		index = pages[0]->index >> shift;
+		if ((pages[nr_pages - 1]->index >> shift) != index)
+			goto skip_pages;
+
+		expected = index == file_end ?
+			   (i_size_read(inode) & (msblk->block_size - 1)) :
+			    msblk->block_size;
+
+		bsize = read_blocklist(inode, index, &block);
+		if (bsize == 0)
+			goto skip_pages;
+
+		actor = squashfs_page_actor_init_special(msblk, pages, nr_pages,
+							 expected);
+		if (!actor)
+			goto skip_pages;
+
+		res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor);
+
+		kfree(actor);
+
+		if (res == expected) {
+			int bytes;
+
+			/* Last page (if present) may have trailing bytes not filled */
+			bytes = res % PAGE_SIZE;
+			if (pages[nr_pages - 1]->index == file_end && bytes)
+				memzero_page(pages[nr_pages - 1], bytes,
+					     PAGE_SIZE - bytes);
+
+			for (i = 0; i < nr_pages; i++) {
+				flush_dcache_page(pages[i]);
+				SetPageUptodate(pages[i]);
+			}
+		}
+
+		for (i = 0; i < nr_pages; i++) {
+			unlock_page(pages[i]);
+			put_page(pages[i]);
+		}
+	}
+
+	kfree(pages);
+	return;
+
+skip_pages:
+	for (i = 0; i < nr_pages; i++) {
+		unlock_page(pages[i]);
+		put_page(pages[i]);
+	}
+	kfree(pages);
+}
 
 const struct address_space_operations squashfs_aops = {
-	.read_folio = squashfs_read_folio
+	.read_folio = squashfs_read_folio,
+	.readahead = squashfs_readahead
 };
-- 
cgit v1.2.3


From b09a7a036d2035b14636cd4c4c69518d73770f65 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@squashfs.org.uk>
Date: Fri, 17 Jun 2022 16:38:15 +0800
Subject: squashfs: support reading fragments in readahead call

Add a function which can be used to read fragments in the readahead call.

This function is necessary because filesystems built with the -tailends
(or -always-use-fragments) option may have fragments present which cannot
be currently handled.

Link: https://lkml.kernel.org/r/20220617083810.337573-5-hsinyi@chromium.org
Signed-off-by: Phillip Lougher <phillip@squashfs.org.uk>
Signed-off-by: Hsin-Yi Wang <hsinyi@chromium.org>
Cc: Hou Tao <houtao1@huawei.com>
Cc: kernel test robot <lkp@intel.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miao Xie <miaoxie@huawei.com>
Cc: Xiongwei Song <Xiongwei.Song@windriver.com>
Cc: Zhang Yi <yi.zhang@huawei.com>
Cc: Zheng Liang <zhengliang6@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/squashfs/file.c | 47 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 44 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index 128ebe9aded8..7ff0b03cceab 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -496,6 +496,41 @@ out:
 	return 0;
 }
 
+static int squashfs_readahead_fragment(struct page **page,
+	unsigned int pages, unsigned int expected)
+{
+	struct inode *inode = page[0]->mapping->host;
+	struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb,
+		squashfs_i(inode)->fragment_block,
+		squashfs_i(inode)->fragment_size);
+	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+	unsigned int n, mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1;
+
+	if (buffer->error)
+		goto out;
+
+	expected += squashfs_i(inode)->fragment_offset;
+
+	for (n = 0; n < pages; n++) {
+		unsigned int base = (page[n]->index & mask) << PAGE_SHIFT;
+		unsigned int offset = base + squashfs_i(inode)->fragment_offset;
+
+		if (expected > offset) {
+			unsigned int avail = min_t(unsigned int, expected -
+				offset, PAGE_SIZE);
+
+			squashfs_fill_page(page[n], buffer, offset, avail);
+		}
+
+		unlock_page(page[n]);
+		put_page(page[n]);
+	}
+
+out:
+	squashfs_cache_put(buffer);
+	return buffer->error;
+}
+
 static void squashfs_readahead(struct readahead_control *ractl)
 {
 	struct inode *inode = ractl->mapping->host;
@@ -512,9 +547,6 @@ static void squashfs_readahead(struct readahead_control *ractl)
 
 	readahead_expand(ractl, start, (len | mask) + 1);
 
-	if (file_end == 0)
-		return;
-
 	pages = kmalloc_array(max_pages, sizeof(void *), GFP_KERNEL);
 	if (!pages)
 		return;
@@ -540,6 +572,15 @@ static void squashfs_readahead(struct readahead_control *ractl)
 			   (i_size_read(inode) & (msblk->block_size - 1)) :
 			    msblk->block_size;
 
+		if (index == file_end && squashfs_i(inode)->fragment_block !=
+						SQUASHFS_INVALID_BLK) {
+			res = squashfs_readahead_fragment(pages, nr_pages,
+							  expected);
+			if (res)
+				goto skip_pages;
+			continue;
+		}
+
 		bsize = read_blocklist(inode, index, &block);
 		if (bsize == 0)
 			goto skip_pages;
-- 
cgit v1.2.3


From ed8fb78d7ecdeb3e2e86df0027e2c2cc55f9908b Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 23 Jul 2022 20:09:07 +0300
Subject: proc: add some (hopefully) insightful comments

* /proc/${pid}/net status
* removing PDE vs last close stuff (again!)
* random small stuff

Link: https://lkml.kernel.org/r/YtwrM6sDC0OQ53YB@localhost.localdomain
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/array.c    |  4 ++++
 fs/proc/inode.c    | 17 ++++++++++++-----
 fs/proc/proc_net.c |  6 ++++++
 fs/proc/root.c     |  5 +++++
 4 files changed, 27 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 65fa603422e0..99fcbfda8e25 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -99,6 +99,10 @@ void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape)
 {
 	char tcomm[64];
 
+	/*
+	 * Test before PF_KTHREAD because all workqueue worker threads are
+	 * kernel threads.
+	 */
 	if (p->flags & PF_WQ_WORKER)
 		wq_worker_comm(tcomm, sizeof(tcomm), p);
 	else if (p->flags & PF_KTHREAD)
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index fd40d60169b5..f130499ad843 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -212,7 +212,15 @@ static void unuse_pde(struct proc_dir_entry *pde)
 		complete(pde->pde_unload_completion);
 }
 
-/* pde is locked on entry, unlocked on exit */
+/*
+ * At most 2 contexts can enter this function: the one doing the last
+ * close on the descriptor and whoever is deleting PDE itself.
+ *
+ * First to enter calls ->proc_release hook and signals its completion
+ * to the second one which waits and then does nothing.
+ *
+ * PDE is locked on entry, unlocked on exit.
+ */
 static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
 	__releases(&pde->pde_unload_lock)
 {
@@ -222,9 +230,6 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
 	 *
 	 * rmmod (remove_proc_entry() et al) can't delete an entry and proceed:
 	 * "struct file" needs to be available at the right moment.
-	 *
-	 * Therefore, first process to enter this function does ->release() and
-	 * signals its completion to the other process which does nothing.
 	 */
 	if (pdeo->closing) {
 		/* somebody else is doing that, just wait */
@@ -238,10 +243,12 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
 
 		pdeo->closing = true;
 		spin_unlock(&pde->pde_unload_lock);
+
 		file = pdeo->file;
 		pde->proc_ops->proc_release(file_inode(file), file);
+
 		spin_lock(&pde->pde_unload_lock);
-		/* After ->release. */
+		/* Strictly after ->proc_release, see above. */
 		list_del(&pdeo->lh);
 		c = pdeo->c;
 		spin_unlock(&pde->pde_unload_lock);
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index bbce6fbe779c..856839b8ae8b 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -350,6 +350,12 @@ static __net_init int proc_net_ns_init(struct net *net)
 	kgid_t gid;
 	int err;
 
+	/*
+	 * This PDE acts only as an anchor for /proc/${pid}/net hierarchy.
+	 * Corresponding inode (PDE(inode) == net->proc_net) is never
+	 * instantiated therefore blanket zeroing is fine.
+	 * net->proc_net_stat inode is instantiated normally.
+	 */
 	err = -ENOMEM;
 	netd = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL);
 	if (!netd)
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 5a7d15d197f8..3c2ee3eb1138 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -302,6 +302,11 @@ void __init proc_root_init(void)
 	proc_mkdir("bus", NULL);
 	proc_sys_init();
 
+	/*
+	 * Last things last. It is not like userspace processes eager
+	 * to open /proc files exist at this point but register last
+	 * anyway.
+	 */
 	register_filesystem(&proc_fs_type);
 }
 
-- 
cgit v1.2.3


From 97d3b2676fc6bc4865eb825037f4492f0fb804eb Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Thu, 21 Jul 2022 22:49:25 +0200
Subject: ocfs2: remove some useless functions

Patch series "ocfs2: A few clean_ups", v2.

__ocfs2_node_map_set_bit() and __ocfs2_node_map_clear_bit() are just
wrapper around set_bit() and clear_bit().

The leading __ also makes think that these functions are non-atomic just
like __set_bit() and __clear_bit().

So, just remove these wrappers and call set_bit() and clear_bit()
directly.

Link: https://lkml.kernel.org/r/cover.1658436259.git.christophe.jaillet@wanadoo.fr
Link: https://lkml.kernel.org/r/bd1429c84ec7d174c96dbb67a2b42b1b456d9394.1658436259.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Gang He <ghe@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/ocfs2/heartbeat.c | 21 ++-------------------
 1 file changed, 2 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index 9099d8fc7599..1d72e0788943 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -24,11 +24,6 @@
 
 #include "buffer_head_io.h"
 
-static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
-					    int bit);
-static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
-					      int bit);
-
 /* special case -1 for now
  * TODO: should *really* make sure the calling func never passes -1!!  */
 static void ocfs2_node_map_init(struct ocfs2_node_map *map)
@@ -65,12 +60,6 @@ void ocfs2_do_node_down(int node_num, void *data)
 	ocfs2_recovery_thread(osb, node_num);
 }
 
-static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
-					    int bit)
-{
-	set_bit(bit, map->map);
-}
-
 void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
 			    struct ocfs2_node_map *map,
 			    int bit)
@@ -79,16 +68,10 @@ void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
 		return;
 	BUG_ON(bit >= map->num_nodes);
 	spin_lock(&osb->node_map_lock);
-	__ocfs2_node_map_set_bit(map, bit);
+	set_bit(bit, map->map);
 	spin_unlock(&osb->node_map_lock);
 }
 
-static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
-					      int bit)
-{
-	clear_bit(bit, map->map);
-}
-
 void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
 			      struct ocfs2_node_map *map,
 			      int bit)
@@ -97,7 +80,7 @@ void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
 		return;
 	BUG_ON(bit >= map->num_nodes);
 	spin_lock(&osb->node_map_lock);
-	__ocfs2_node_map_clear_bit(map, bit);
+	clear_bit(bit, map->map);
 	spin_unlock(&osb->node_map_lock);
 }
 
-- 
cgit v1.2.3


From 702f3cf374b85d2e77431c80e870ee31ea03cdd8 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Thu, 21 Jul 2022 22:49:37 +0200
Subject: ocfs2: use the bitmap API to simplify code

Use bitmap_zero() instead of hand-writing it.  It is less verbose.

While at it, add an explicit #include <linux/bitmap.h>.

Link: https://lkml.kernel.org/r/86d2a027c319db12055c98f00c65f7d01e703722.1658436259.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Gang He <ghe@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/ocfs2/heartbeat.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index 1d72e0788943..dd29d60af154 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -8,6 +8,7 @@
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
  */
 
+#include <linux/bitmap.h>
 #include <linux/fs.h>
 #include <linux/types.h>
 #include <linux/highmem.h>
@@ -29,8 +30,7 @@
 static void ocfs2_node_map_init(struct ocfs2_node_map *map)
 {
 	map->num_nodes = OCFS2_NODE_MAP_MAX_NODES;
-	memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) *
-	       sizeof(unsigned long));
+	bitmap_zero(map->map, OCFS2_NODE_MAP_MAX_NODES);
 }
 
 void ocfs2_init_node_maps(struct ocfs2_super *osb)
-- 
cgit v1.2.3


From 45ee6d1e935d879d86aebd1fd15afb3bc015c4a0 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Thu, 21 Jul 2022 22:49:48 +0200
Subject: ocfs2: fix a typo in a comment

s/heartbaet/heartbeat

Link: https://lkml.kernel.org/r/4d4a6786e8ad522bfad6d2401b7f6634f8af0e5d.1658436259.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Gang He <ghe@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/ocfs2/heartbeat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index dd29d60af154..22da768e65b7 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -2,7 +2,7 @@
 /*
  * heartbeat.c
  *
- * Register ourselves with the heartbaet service, keep our node maps
+ * Register ourselves with the heartbeat service, keep our node maps
  * up to date, and fire off recovery when needed.
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
-- 
cgit v1.2.3