From 343a3e8bc635bd4c58d45a4fe67f9c3a78fbd191 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 26 Oct 2020 17:20:50 +0100 Subject: bpf: Fix -Wshadow warnings There are thousands of warnings about one macro in a W=2 build: include/linux/filter.h:561:6: warning: declaration of 'ret' shadows a previous local [-Wshadow] Prefix all the locals in that macro with __ to avoid most of these warnings. Fixes: 492ecee892c2 ("bpf: enable program stats") Signed-off-by: Arnd Bergmann Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201026162110.3710415-1-arnd@kernel.org --- include/linux/filter.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 72d62cbc1578..1b62397bd124 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -558,21 +558,21 @@ struct sk_filter { DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); #define __BPF_PROG_RUN(prog, ctx, dfunc) ({ \ - u32 ret; \ + u32 __ret; \ cant_migrate(); \ if (static_branch_unlikely(&bpf_stats_enabled_key)) { \ - struct bpf_prog_stats *stats; \ - u64 start = sched_clock(); \ - ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ - stats = this_cpu_ptr(prog->aux->stats); \ - u64_stats_update_begin(&stats->syncp); \ - stats->cnt++; \ - stats->nsecs += sched_clock() - start; \ - u64_stats_update_end(&stats->syncp); \ + struct bpf_prog_stats *__stats; \ + u64 __start = sched_clock(); \ + __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ + __stats = this_cpu_ptr(prog->aux->stats); \ + u64_stats_update_begin(&__stats->syncp); \ + __stats->cnt++; \ + __stats->nsecs += sched_clock() - __start; \ + u64_stats_update_end(&__stats->syncp); \ } else { \ - ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ + __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ } \ - ret; }) + __ret; }) #define BPF_PROG_RUN(prog, ctx) \ __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func) -- cgit v1.2.3 From e5e1a4bc916d29958c3b587354293738fcb984d7 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Tue, 27 Oct 2020 13:32:01 +0100 Subject: xsk: Fix possible memory leak at socket close MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a possible memory leak at xsk socket close that is caused by the refcounting of the umem object being wrong. The reference count of the umem was decremented only after the pool had been freed. Note that if the buffer pool is destroyed, it is important that the umem is destroyed after the pool, otherwise the umem would disappear while the driver is still running. And as the buffer pool needs to be destroyed in a work queue, the umem is also (if its refcount reaches zero) destroyed after the buffer pool in that same work queue. What was missing is that the refcount also needs to be decremented when the pool is not freed and when the pool has not even been created. The first case happens when the refcount of the pool is higher than 1, i.e. it is still being used by some other socket using the same device and queue id. In this case, it is safe to decrement the refcount of the umem outside of the work queue as the umem will never be freed because the refcount of the umem is always greater than or equal to the refcount of the buffer pool. The second case is if the buffer pool has not been created yet, i.e. the socket was closed before it was bound but after the umem was created. In this case, it is safe to destroy the umem outside of the work queue, since there is no pool that can use it by definition. Fixes: 1c1efc2af158 ("xsk: Create and free buffer pool independently from umem") Reported-by: syzbot+eb71df123dc2be2c1456@syzkaller.appspotmail.com Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/1603801921-2712-1-git-send-email-magnus.karlsson@gmail.com --- include/net/xsk_buff_pool.h | 2 +- net/xdp/xsk.c | 3 ++- net/xdp/xsk_buff_pool.c | 7 +++++-- 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 0140d086dc84..01755b838c74 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -86,7 +86,7 @@ int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem, void xp_destroy(struct xsk_buff_pool *pool); void xp_release(struct xdp_buff_xsk *xskb); void xp_get_pool(struct xsk_buff_pool *pool); -void xp_put_pool(struct xsk_buff_pool *pool); +bool xp_put_pool(struct xsk_buff_pool *pool); void xp_clear_dev(struct xsk_buff_pool *pool); void xp_add_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs); void xp_del_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index b71a32eeae65..cfbec3989a76 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -1146,7 +1146,8 @@ static void xsk_destruct(struct sock *sk) if (!sock_flag(sk, SOCK_DEAD)) return; - xp_put_pool(xs->pool); + if (!xp_put_pool(xs->pool)) + xdp_put_umem(xs->umem); sk_refcnt_debug_dec(sk); } diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 64c9e55d4d4e..8a3bf4e1318e 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -251,15 +251,18 @@ void xp_get_pool(struct xsk_buff_pool *pool) refcount_inc(&pool->users); } -void xp_put_pool(struct xsk_buff_pool *pool) +bool xp_put_pool(struct xsk_buff_pool *pool) { if (!pool) - return; + return false; if (refcount_dec_and_test(&pool->users)) { INIT_WORK(&pool->work, xp_release_deferred); schedule_work(&pool->work); + return true; } + + return false; } static struct xsk_dma_map *xp_find_dma_map(struct xsk_buff_pool *pool) -- cgit v1.2.3 From 080b6f40763565f65ebb9540219c71ce885cf568 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 28 Oct 2020 18:15:05 +0100 Subject: bpf: Don't rely on GCC __attribute__((optimize)) to disable GCSE Commit 3193c0836 ("bpf: Disable GCC -fgcse optimization for ___bpf_prog_run()") introduced a __no_fgcse macro that expands to a function scope __attribute__((optimize("-fno-gcse"))), to disable a GCC specific optimization that was causing trouble on x86 builds, and was not expected to have any positive effect in the first place. However, as the GCC manual documents, __attribute__((optimize)) is not for production use, and results in all other optimization options to be forgotten for the function in question. This can cause all kinds of trouble, but in one particular reported case, it causes -fno-asynchronous-unwind-tables to be disregarded, resulting in .eh_frame info to be emitted for the function. This reverts commit 3193c0836, and instead, it disables the -fgcse optimization for the entire source file, but only when building for X86 using GCC with CONFIG_BPF_JIT_ALWAYS_ON disabled. Note that the original commit states that CONFIG_RETPOLINE=n triggers the issue, whereas CONFIG_RETPOLINE=y performs better without the optimization, so it is kept disabled in both cases. Fixes: 3193c0836f20 ("bpf: Disable GCC -fgcse optimization for ___bpf_prog_run()") Signed-off-by: Ard Biesheuvel Signed-off-by: Alexei Starovoitov Tested-by: Geert Uytterhoeven Reviewed-by: Nick Desaulniers Link: https://lore.kernel.org/lkml/CAMuHMdUg0WJHEcq6to0-eODpXPOywLot6UD2=GFHpzoj_hCoBQ@mail.gmail.com/ Link: https://lore.kernel.org/bpf/20201028171506.15682-2-ardb@kernel.org --- include/linux/compiler-gcc.h | 2 -- include/linux/compiler_types.h | 4 ---- kernel/bpf/Makefile | 6 +++++- kernel/bpf/core.c | 2 +- 4 files changed, 6 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index d1e3c6896b71..5deb37024574 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -175,5 +175,3 @@ #else #define __diag_GCC_8(s) #endif - -#define __no_fgcse __attribute__((optimize("-fno-gcse"))) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 6e390d58a9f8..ac3fa37a84f9 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -247,10 +247,6 @@ struct ftrace_likely_data { #define asm_inline asm #endif -#ifndef __no_fgcse -# define __no_fgcse -#endif - /* Are two types/vars the same type (ignoring qualifiers)? */ #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index bdc8cd1b6767..c1b9f71ee6aa 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,6 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := core.o -CFLAGS_core.o += $(call cc-disable-warning, override-init) +ifneq ($(CONFIG_BPF_JIT_ALWAYS_ON),y) +# ___bpf_prog_run() needs GCSE disabled on x86; see 3193c0836f203 for details +cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse +endif +CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9268d77898b7..55454d2278b1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1369,7 +1369,7 @@ u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) * * Decode and execute eBPF instructions. */ -static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) +static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z -- cgit v1.2.3 From fc0021aa340af65a0a37d77be39e22aa886a6132 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Oct 2020 08:33:09 +0200 Subject: swiotlb: remove the tbl_dma_addr argument to swiotlb_tbl_map_single The tbl_dma_addr argument is used to check the DMA boundary for the allocations, and thus needs to be a dma_addr_t. swiotlb-xen instead passed a physical address, which could lead to incorrect results for strange offsets. Fix this by removing the parameter entirely and hard code the DMA address for io_tlb_start instead. Fixes: 91ffe4ad534a ("swiotlb-xen: introduce phys_to_dma/dma_to_phys translations") Signed-off-by: Christoph Hellwig Reviewed-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- drivers/iommu/intel/iommu.c | 5 ++--- drivers/xen/swiotlb-xen.c | 3 +-- include/linux/swiotlb.h | 10 +++------- kernel/dma/swiotlb.c | 16 ++++++---------- 4 files changed, 12 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 8651f6d4dfa0..6b560e6f1930 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3815,9 +3815,8 @@ bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size, * page aligned, we don't need to use a bounce page. */ if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) { - tlb_addr = swiotlb_tbl_map_single(dev, - phys_to_dma_unencrypted(dev, io_tlb_start), - paddr, size, aligned_size, dir, attrs); + tlb_addr = swiotlb_tbl_map_single(dev, paddr, size, + aligned_size, dir, attrs); if (tlb_addr == DMA_MAPPING_ERROR) { goto swiotlb_error; } else { diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 71ce1b7a23d1..2b385c1b4a99 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -395,8 +395,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, */ trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); - map = swiotlb_tbl_map_single(dev, virt_to_phys(xen_io_tlb_start), - phys, size, size, dir, attrs); + map = swiotlb_tbl_map_single(dev, phys, size, size, dir, attrs); if (map == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 513913ff7486..3bb72266a75a 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -45,13 +45,9 @@ enum dma_sync_target { SYNC_FOR_DEVICE = 1, }; -extern phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, - dma_addr_t tbl_dma_addr, - phys_addr_t phys, - size_t mapping_size, - size_t alloc_size, - enum dma_data_direction dir, - unsigned long attrs); +phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, + size_t mapping_size, size_t alloc_size, + enum dma_data_direction dir, unsigned long attrs); extern void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 54078f0d4c87..781b9dca197c 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -445,14 +445,11 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, } } -phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, - dma_addr_t tbl_dma_addr, - phys_addr_t orig_addr, - size_t mapping_size, - size_t alloc_size, - enum dma_data_direction dir, - unsigned long attrs) +phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, + size_t mapping_size, size_t alloc_size, + enum dma_data_direction dir, unsigned long attrs) { + dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(hwdev, io_tlb_start); unsigned long flags; phys_addr_t tlb_addr; unsigned int nslots, stride, index, wrap; @@ -671,9 +668,8 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size, swiotlb_force); - swiotlb_addr = swiotlb_tbl_map_single(dev, - phys_to_dma_unencrypted(dev, io_tlb_start), - paddr, size, size, dir, attrs); + swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, dir, + attrs); if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; -- cgit v1.2.3 From 763e4cdc0f6d5cea45c896fef67f7be4bdefcca7 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 29 Oct 2020 14:30:48 -0700 Subject: iomap: support partial page discard on writeback block mapping failure iomap writeback mapping failure only calls into ->discard_page() if the current page has not been added to the ioend. Accordingly, the XFS callback assumes a full page discard and invalidation. This is problematic for sub-page block size filesystems where some portion of a page might have been mapped successfully before a failure to map a delalloc block occurs. ->discard_page() is not called in that error scenario and the bio is explicitly failed by iomap via the error return from ->prepare_ioend(). As a result, the filesystem leaks delalloc blocks and corrupts the filesystem block counters. Since XFS is the only user of ->discard_page(), tweak the semantics to invoke the callback unconditionally on mapping errors and provide the file offset that failed to map. Update xfs_discard_page() to discard the corresponding portion of the file and pass the range along to iomap_invalidatepage(). The latter already properly handles both full and sub-page scenarios by not changing any iomap or page state on sub-page invalidations. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap/buffered-io.c | 15 ++++++++------- fs/xfs/xfs_aops.c | 14 ++++++++------ include/linux/iomap.h | 2 +- 3 files changed, 17 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 8180061b9e16..e4ea1f9f94d0 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1382,14 +1382,15 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, * appropriately. */ if (unlikely(error)) { + /* + * Let the filesystem know what portion of the current page + * failed to map. If the page wasn't been added to ioend, it + * won't be affected by I/O completion and we must unlock it + * now. + */ + if (wpc->ops->discard_page) + wpc->ops->discard_page(page, file_offset); if (!count) { - /* - * If the current page hasn't been added to ioend, it - * won't be affected by I/O completions and we must - * discard and unlock it right here. - */ - if (wpc->ops->discard_page) - wpc->ops->discard_page(page); ClearPageUptodate(page); unlock_page(page); goto done; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 55d126d4e096..5bf37afae5e9 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -527,13 +527,15 @@ xfs_prepare_ioend( */ static void xfs_discard_page( - struct page *page) + struct page *page, + loff_t fileoff) { struct inode *inode = page->mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - loff_t offset = page_offset(page); - xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, offset); + unsigned int pageoff = offset_in_page(fileoff); + xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, fileoff); + xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, pageoff); int error; if (XFS_FORCED_SHUTDOWN(mp)) @@ -541,14 +543,14 @@ xfs_discard_page( xfs_alert_ratelimited(mp, "page discard on page "PTR_FMT", inode 0x%llx, offset %llu.", - page, ip->i_ino, offset); + page, ip->i_ino, fileoff); error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - i_blocks_per_page(inode, page)); + i_blocks_per_page(inode, page) - pageoff_fsb); if (error && !XFS_FORCED_SHUTDOWN(mp)) xfs_alert(mp, "page discard unable to remove delalloc mapping."); out_invalidate: - iomap_invalidatepage(page, 0, PAGE_SIZE); + iomap_invalidatepage(page, pageoff, PAGE_SIZE - pageoff); } static const struct iomap_writeback_ops xfs_writeback_ops = { diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 172b3397a1a3..5bd3cac4df9c 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -221,7 +221,7 @@ struct iomap_writeback_ops { * Optional, allows the file system to discard state on a page where * we failed to submit any I/O. */ - void (*discard_page)(struct page *page); + void (*discard_page)(struct page *page, loff_t fileoff); }; struct iomap_writepage_ctx { -- cgit v1.2.3 From fdaf083cdfb556a45c422c8998268baf1ab26829 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 30 Oct 2020 09:37:30 -0600 Subject: io_uring: properly handle SQPOLL request cancelations Track if a given task io_uring context contains SQPOLL instances, so we can iterate those for cancelation (and request counts). This ensures that we properly wait on SQPOLL contexts, and find everything that needs canceling. Signed-off-by: Jens Axboe --- fs/io_uring.c | 77 ++++++++++++++++++++++++++++++++++++++++-------- include/linux/io_uring.h | 3 +- 2 files changed, 67 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/fs/io_uring.c b/fs/io_uring.c index a7429c977eb3..b398394a919e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1668,7 +1668,8 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); - } else if (ctx->cq_overflow_flushed || req->task->io_uring->in_idle) { + } else if (ctx->cq_overflow_flushed || + atomic_read(&req->task->io_uring->in_idle)) { /* * If we're in ring overflow flush mode, or in task cancel mode, * then we cannot store the request for later flushing, we need @@ -1838,7 +1839,7 @@ static void __io_free_req(struct io_kiocb *req) io_dismantle_req(req); percpu_counter_dec(&tctx->inflight); - if (tctx->in_idle) + if (atomic_read(&tctx->in_idle)) wake_up(&tctx->wait); put_task_struct(req->task); @@ -7695,7 +7696,8 @@ static int io_uring_alloc_task_context(struct task_struct *task) xa_init(&tctx->xa); init_waitqueue_head(&tctx->wait); tctx->last = NULL; - tctx->in_idle = 0; + atomic_set(&tctx->in_idle, 0); + tctx->sqpoll = false; io_init_identity(&tctx->__identity); tctx->identity = &tctx->__identity; task->io_uring = tctx; @@ -8598,8 +8600,11 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, { struct task_struct *task = current; - if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) + if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { task = ctx->sq_data->thread; + atomic_inc(&task->io_uring->in_idle); + io_sq_thread_park(ctx->sq_data); + } io_cqring_overflow_flush(ctx, true, task, files); @@ -8607,12 +8612,23 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, io_run_task_work(); cond_resched(); } + + if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { + atomic_dec(&task->io_uring->in_idle); + /* + * If the files that are going away are the ones in the thread + * identity, clear them out. + */ + if (task->io_uring->identity->files == files) + task->io_uring->identity->files = NULL; + io_sq_thread_unpark(ctx->sq_data); + } } /* * Note that this task has used io_uring. We use it for cancelation purposes. */ -static int io_uring_add_task_file(struct file *file) +static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) { struct io_uring_task *tctx = current->io_uring; @@ -8634,6 +8650,14 @@ static int io_uring_add_task_file(struct file *file) tctx->last = file; } + /* + * This is race safe in that the task itself is doing this, hence it + * cannot be going through the exit/cancel paths at the same time. + * This cannot be modified while exit/cancel is running. + */ + if (!tctx->sqpoll && (ctx->flags & IORING_SETUP_SQPOLL)) + tctx->sqpoll = true; + return 0; } @@ -8675,7 +8699,7 @@ void __io_uring_files_cancel(struct files_struct *files) unsigned long index; /* make sure overflow events are dropped */ - tctx->in_idle = true; + atomic_inc(&tctx->in_idle); xa_for_each(&tctx->xa, index, file) { struct io_ring_ctx *ctx = file->private_data; @@ -8684,6 +8708,35 @@ void __io_uring_files_cancel(struct files_struct *files) if (files) io_uring_del_task_file(file); } + + atomic_dec(&tctx->in_idle); +} + +static s64 tctx_inflight(struct io_uring_task *tctx) +{ + unsigned long index; + struct file *file; + s64 inflight; + + inflight = percpu_counter_sum(&tctx->inflight); + if (!tctx->sqpoll) + return inflight; + + /* + * If we have SQPOLL rings, then we need to iterate and find them, and + * add the pending count for those. + */ + xa_for_each(&tctx->xa, index, file) { + struct io_ring_ctx *ctx = file->private_data; + + if (ctx->flags & IORING_SETUP_SQPOLL) { + struct io_uring_task *__tctx = ctx->sqo_task->io_uring; + + inflight += percpu_counter_sum(&__tctx->inflight); + } + } + + return inflight; } /* @@ -8697,11 +8750,11 @@ void __io_uring_task_cancel(void) s64 inflight; /* make sure overflow events are dropped */ - tctx->in_idle = true; + atomic_inc(&tctx->in_idle); do { /* read completions before cancelations */ - inflight = percpu_counter_sum(&tctx->inflight); + inflight = tctx_inflight(tctx); if (!inflight) break; __io_uring_files_cancel(NULL); @@ -8712,13 +8765,13 @@ void __io_uring_task_cancel(void) * If we've seen completions, retry. This avoids a race where * a completion comes in before we did prepare_to_wait(). */ - if (inflight != percpu_counter_sum(&tctx->inflight)) + if (inflight != tctx_inflight(tctx)) continue; schedule(); } while (1); finish_wait(&tctx->wait, &wait); - tctx->in_idle = false; + atomic_dec(&tctx->in_idle); } static int io_uring_flush(struct file *file, void *data) @@ -8863,7 +8916,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, io_sqpoll_wait_sq(ctx); submitted = to_submit; } else if (to_submit) { - ret = io_uring_add_task_file(f.file); + ret = io_uring_add_task_file(ctx, f.file); if (unlikely(ret)) goto out; mutex_lock(&ctx->uring_lock); @@ -9092,7 +9145,7 @@ err_fd: #if defined(CONFIG_UNIX) ctx->ring_sock->file = file; #endif - if (unlikely(io_uring_add_task_file(file))) { + if (unlikely(io_uring_add_task_file(ctx, file))) { file = ERR_PTR(-ENOMEM); goto err_fd; } diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 868364cea3b7..35b2d845704d 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -30,7 +30,8 @@ struct io_uring_task { struct percpu_counter inflight; struct io_identity __identity; struct io_identity *identity; - bool in_idle; + atomic_t in_idle; + bool sqpoll; }; #if defined(CONFIG_IO_URING) -- cgit v1.2.3 From d321ff589c16d8c2207485a6d7fbdb14e873d46e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 23 Oct 2020 10:41:07 -0400 Subject: SUNRPC: Fix general protection fault in trace_rpc_xdr_overflow() The TP_fast_assign() section is careful enough not to dereference xdr->rqst if it's NULL. The TP_STRUCT__entry section is not. Fixes: 5582863f450c ("SUNRPC: Add XDR overflow trace event") Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/trace/events/sunrpc.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index f45b3c01370c..2477014e3fa6 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -655,10 +655,10 @@ TRACE_EVENT(rpc_xdr_overflow, __field(size_t, tail_len) __field(unsigned int, page_len) __field(unsigned int, len) - __string(progname, - xdr->rqst->rq_task->tk_client->cl_program->name) - __string(procedure, - xdr->rqst->rq_task->tk_msg.rpc_proc->p_name) + __string(progname, xdr->rqst ? + xdr->rqst->rq_task->tk_client->cl_program->name : "unknown") + __string(procedure, xdr->rqst ? + xdr->rqst->rq_task->tk_msg.rpc_proc->p_name : "unknown") ), TP_fast_assign( -- cgit v1.2.3 From d4d50710a8b46082224376ef119a4dbb75b25c56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Nov 2020 09:27:33 +0100 Subject: seq_file: add seq_read_iter iov_iter based variant for reading a seq_file. seq_read is reimplemented on top of the iter variant. Signed-off-by: Christoph Hellwig Tested-by: Greg Kroah-Hartman Signed-off-by: Linus Torvalds --- fs/seq_file.c | 45 ++++++++++++++++++++++++++++++++------------- include/linux/seq_file.h | 1 + 2 files changed, 33 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/fs/seq_file.c b/fs/seq_file.c index 31219c1db17d..3b20e21604e7 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -146,7 +147,28 @@ Eoverflow: */ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { - struct seq_file *m = file->private_data; + struct iovec iov = { .iov_base = buf, .iov_len = size}; + struct kiocb kiocb; + struct iov_iter iter; + ssize_t ret; + + init_sync_kiocb(&kiocb, file); + iov_iter_init(&iter, READ, &iov, 1, size); + + kiocb.ki_pos = *ppos; + ret = seq_read_iter(&kiocb, &iter); + *ppos = kiocb.ki_pos; + return ret; +} +EXPORT_SYMBOL(seq_read); + +/* + * Ready-made ->f_op->read_iter() + */ +ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct seq_file *m = iocb->ki_filp->private_data; + size_t size = iov_iter_count(iter); size_t copied = 0; size_t n; void *p; @@ -158,14 +180,14 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) * if request is to read from zero offset, reset iterator to first * record as it might have been already advanced by previous requests */ - if (*ppos == 0) { + if (iocb->ki_pos == 0) { m->index = 0; m->count = 0; } - /* Don't assume *ppos is where we left it */ - if (unlikely(*ppos != m->read_pos)) { - while ((err = traverse(m, *ppos)) == -EAGAIN) + /* Don't assume ki_pos is where we left it */ + if (unlikely(iocb->ki_pos != m->read_pos)) { + while ((err = traverse(m, iocb->ki_pos)) == -EAGAIN) ; if (err) { /* With prejudice... */ @@ -174,7 +196,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) m->count = 0; goto Done; } else { - m->read_pos = *ppos; + m->read_pos = iocb->ki_pos; } } @@ -187,13 +209,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) /* if not empty - flush it first */ if (m->count) { n = min(m->count, size); - err = copy_to_user(buf, m->buf + m->from, n); - if (err) + if (copy_to_iter(m->buf + m->from, n, iter) != n) goto Efault; m->count -= n; m->from += n; size -= n; - buf += n; copied += n; if (!size) goto Done; @@ -254,8 +274,7 @@ Fill: } m->op->stop(m, p); n = min(m->count, size); - err = copy_to_user(buf, m->buf, n); - if (err) + if (copy_to_iter(m->buf, n, iter) != n) goto Efault; copied += n; m->count -= n; @@ -264,7 +283,7 @@ Done: if (!copied) copied = err; else { - *ppos += copied; + iocb->ki_pos += copied; m->read_pos += copied; } mutex_unlock(&m->lock); @@ -276,7 +295,7 @@ Efault: err = -EFAULT; goto Done; } -EXPORT_SYMBOL(seq_read); +EXPORT_SYMBOL(seq_read_iter); /** * seq_lseek - ->llseek() method for sequential files. diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 813614d4b71f..b83b3ae3c877 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -107,6 +107,7 @@ void seq_pad(struct seq_file *m, char c); char *mangle_path(char *s, const char *p, const char *esc); int seq_open(struct file *, const struct seq_operations *); ssize_t seq_read(struct file *, char __user *, size_t, loff_t *); +ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter); loff_t seq_lseek(struct file *, loff_t, int); int seq_release(struct inode *, struct file *); int seq_write(struct seq_file *seq, const void *data, size_t len); -- cgit v1.2.3 From b21ebf143af219207214c79bc217beb39c43212a Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:58:51 -0800 Subject: ext4: mark fc ineligible if inode gets evictied due to mem pressure If inode gets evicted due to memory pressure, we have to remove it from the fast commit list. However, that inode may have uncommitted changes that fast commits will lose. So, just fall back to full commits in this case. Also, rename the fast commit ineligiblity reason from "EXT4_FC_REASON_MEM" to "EXT4_FC_REASON_MEM_NOMEM" for better expression. Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-3-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 4 ++-- fs/ext4/fast_commit.h | 2 +- fs/ext4/inode.c | 2 ++ include/trace/events/ext4.h | 6 +++--- 4 files changed, 8 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 8d43058386c3..48b7bc129392 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -384,7 +384,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) mutex_unlock(&ei->i_fc_lock); node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_MEM); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } @@ -397,7 +397,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) if (!node->fcd_name.name) { kmem_cache_free(ext4_fc_dentry_cachep, node); ext4_fc_mark_ineligible(inode->i_sb, - EXT4_FC_REASON_MEM); + EXT4_FC_REASON_NOMEM); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index 06907d485989..140fbb6af19e 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -100,7 +100,7 @@ enum { EXT4_FC_REASON_XATTR = 0, EXT4_FC_REASON_CROSS_RENAME, EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, - EXT4_FC_REASON_MEM, + EXT4_FC_REASON_NOMEM, EXT4_FC_REASON_SWAP_BOOT, EXT4_FC_REASON_RESIZE, EXT4_FC_REASON_RENAME_DIR, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b96a18679a27..081b6a86b5db 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -327,6 +327,8 @@ stop_handle: ext4_xattr_inode_array_free(ea_inode_array); return; no_delete: + if (!list_empty(&EXT4_I(inode)->i_fc_list)) + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM); ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ } diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index b14314fcf732..0f899d3b09d3 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -100,7 +100,7 @@ TRACE_DEFINE_ENUM(ES_REFERENCED_B); { EXT4_FC_REASON_XATTR, "XATTR"}, \ { EXT4_FC_REASON_CROSS_RENAME, "CROSS_RENAME"}, \ { EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, "JOURNAL_FLAG_CHANGE"}, \ - { EXT4_FC_REASON_MEM, "NO_MEM"}, \ + { EXT4_FC_REASON_NOMEM, "NO_MEM"}, \ { EXT4_FC_REASON_SWAP_BOOT, "SWAP_BOOT"}, \ { EXT4_FC_REASON_RESIZE, "RESIZE"}, \ { EXT4_FC_REASON_RENAME_DIR, "RENAME_DIR"}, \ @@ -2917,13 +2917,13 @@ TRACE_EVENT(ext4_fc_stats, ), TP_printk("dev %d:%d fc ineligible reasons:\n" - "%s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s,%d; " + "%s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d; " "num_commits:%ld, ineligible: %ld, numblks: %ld", MAJOR(__entry->dev), MINOR(__entry->dev), FC_REASON_NAME_STAT(EXT4_FC_REASON_XATTR), FC_REASON_NAME_STAT(EXT4_FC_REASON_CROSS_RENAME), FC_REASON_NAME_STAT(EXT4_FC_REASON_JOURNAL_FLAG_CHANGE), - FC_REASON_NAME_STAT(EXT4_FC_REASON_MEM), + FC_REASON_NAME_STAT(EXT4_FC_REASON_NOMEM), FC_REASON_NAME_STAT(EXT4_FC_REASON_SWAP_BOOT), FC_REASON_NAME_STAT(EXT4_FC_REASON_RESIZE), FC_REASON_NAME_STAT(EXT4_FC_REASON_RENAME_DIR), -- cgit v1.2.3 From ede7dc7fa0af619afc08995776eadb9ff3b0a711 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:58:54 -0800 Subject: jbd2: rename j_maxlen to j_total_len and add jbd2_journal_max_txn_bufs The on-disk superblock field sb->s_maxlen represents the total size of the journal including the fast commit area and is no more the max number of blocks available for a transaction. The maximum number of blocks available to a transaction is reduced by the number of fast commit blocks. So, this patch renames j_maxlen to j_total_len to better represent its intent. Also, it adds a function to calculate max number of bufs available for a transaction. Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-6-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/fsmap.c | 2 +- fs/ext4/super.c | 2 +- fs/jbd2/commit.c | 2 +- fs/jbd2/journal.c | 12 ++++++------ fs/jbd2/recovery.c | 6 +++--- fs/ocfs2/journal.c | 2 +- include/linux/jbd2.h | 9 +++++++-- 7 files changed, 20 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index b232c2767534..4c2a9fe30067 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -280,7 +280,7 @@ static int ext4_getfsmap_logdev(struct super_block *sb, struct ext4_fsmap *keys, /* Fabricate an rmap entry for the external log device. */ irec.fmr_physical = journal->j_blk_offset; - irec.fmr_length = journal->j_maxlen; + irec.fmr_length = journal->j_total_len; irec.fmr_owner = EXT4_FMR_OWN_LOG; irec.fmr_flags = 0; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 738a6dd4957d..8a6dd433bb70 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3976,7 +3976,7 @@ int ext4_calculate_overhead(struct super_block *sb) * loaded or not */ if (sbi->s_journal && !sbi->s_journal_bdev) - overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen); + overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len); else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { /* j_inum for internal journal is non-zero */ j_inode = ext4_get_journal_inode(sb, j_inum); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index fa688e163a80..ec516490cb35 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -801,7 +801,7 @@ start_journal_io: if (first_block < journal->j_tail) freed += journal->j_last - journal->j_first; /* Update tail only if we free significant amount of space */ - if (freed < journal->j_maxlen / 4) + if (freed < jbd2_journal_get_max_txn_bufs(journal)) update_tail = 0; } J_ASSERT(commit_transaction->t_state == T_COMMIT); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0c7c42bd530f..c3c768248527 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1348,7 +1348,7 @@ static journal_t *journal_init_common(struct block_device *bdev, journal->j_dev = bdev; journal->j_fs_dev = fs_dev; journal->j_blk_offset = start; - journal->j_maxlen = len; + journal->j_total_len = len; /* We need enough buffers to write out full descriptor block. */ n = journal->j_blocksize / jbd2_min_tag_size(); journal->j_wbufsize = n; @@ -1531,7 +1531,7 @@ static int journal_reset(journal_t *journal) journal->j_commit_sequence = journal->j_transaction_sequence - 1; journal->j_commit_request = journal->j_commit_sequence; - journal->j_max_transaction_buffers = journal->j_maxlen / 4; + journal->j_max_transaction_buffers = jbd2_journal_get_max_txn_bufs(journal); /* * As a special case, if the on-disk copy is already marked as needing @@ -1792,15 +1792,15 @@ static int journal_get_superblock(journal_t *journal) goto out; } - if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen) - journal->j_maxlen = be32_to_cpu(sb->s_maxlen); - else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) { + if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len) + journal->j_total_len = be32_to_cpu(sb->s_maxlen); + else if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) { printk(KERN_WARNING "JBD2: journal file too short\n"); goto out; } if (be32_to_cpu(sb->s_first) == 0 || - be32_to_cpu(sb->s_first) >= journal->j_maxlen) { + be32_to_cpu(sb->s_first) >= journal->j_total_len) { printk(KERN_WARNING "JBD2: Invalid start block of journal: %u\n", be32_to_cpu(sb->s_first)); diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index eb2606133cd8..dc0694fcfcd1 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -74,8 +74,8 @@ static int do_readahead(journal_t *journal, unsigned int start) /* Do up to 128K of readahead */ max = start + (128 * 1024 / journal->j_blocksize); - if (max > journal->j_maxlen) - max = journal->j_maxlen; + if (max > journal->j_total_len) + max = journal->j_total_len; /* Do the readahead itself. We'll submit MAXBUF buffer_heads at * a time to the block device IO layer. */ @@ -134,7 +134,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal, *bhp = NULL; - if (offset >= journal->j_maxlen) { + if (offset >= journal->j_total_len) { printk(KERN_ERR "JBD2: corrupted journal superblock\n"); return -EFSCORRUPTED; } diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index b9a9d69dde7e..db52e843002a 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -877,7 +877,7 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) goto done; } - trace_ocfs2_journal_init_maxlen(j_journal->j_maxlen); + trace_ocfs2_journal_init_maxlen(j_journal->j_total_len); *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) & OCFS2_JOURNAL_DIRTY_FL); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 1d5566af48ac..e0b6b53eae64 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -988,9 +988,9 @@ struct journal_s struct block_device *j_fs_dev; /** - * @j_maxlen: Total maximum capacity of the journal region on disk. + * @j_total_len: Total maximum capacity of the journal region on disk. */ - unsigned int j_maxlen; + unsigned int j_total_len; /** * @j_reserved_credits: @@ -1624,6 +1624,11 @@ int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode); int jbd2_fc_wait_bufs(journal_t *journal, int num_blks); int jbd2_fc_release_bufs(journal_t *journal); +static inline int jbd2_journal_get_max_txn_bufs(journal_t *journal) +{ + return (journal->j_total_len - journal->j_fc_wbufsize) / 4; +} + /* * is_journal_abort * -- cgit v1.2.3 From a1e5e465b31d6015fccb359d99053b39e5180466 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:58:55 -0800 Subject: ext4: clean up the JBD2 API that initializes fast commits This patch removes jbd2_fc_init() API and its related functions to simplify enabling fast commits. With this change, the number of fast commit blocks to use is solely determined by the JBD2 layer. So, we move the default value for minimum number of fast commit blocks from ext4/fast_commit.h to include/linux/jbd2.h. However, whether or not to use fast commits is determined by the file system. The file system just sets the fast commit feature using jbd2_journal_set_features(). JBD2 layer then determines how many blocks to use for fast commits (based on the value found in the JBD2 superblock). Note that the JBD2 feature flag of fast commits is just an indication that there are fast commit blocks present on disk. It doesn't tell JBD2 layer about the intent of the file system of whether to it wants to use fast commit or not. That's why, we blindly clear the fast commit flag in journal_reset() after the recovery is done. Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-7-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 14 -------- fs/ext4/fast_commit.h | 3 -- fs/ext4/super.c | 8 +++++ fs/jbd2/journal.c | 96 ++++++++++++++++++++++++++++++--------------------- include/linux/jbd2.h | 2 +- 5 files changed, 65 insertions(+), 58 deletions(-) (limited to 'include') diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 9399e9cccb7e..bab60c5d5095 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -2091,8 +2091,6 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, void ext4_fc_init(struct super_block *sb, journal_t *journal) { - int num_fc_blocks; - /* * We set replay callback even if fast commit disabled because we may * could still have fast commit blocks that need to be replayed even if @@ -2102,18 +2100,6 @@ void ext4_fc_init(struct super_block *sb, journal_t *journal) if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) return; journal->j_fc_cleanup_callback = ext4_fc_cleanup; - if (!buffer_uptodate(journal->j_sb_buffer) - && ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO, - true)) { - ext4_msg(sb, KERN_ERR, "I/O error on journal"); - return; - } - num_fc_blocks = be32_to_cpu(journal->j_superblock->s_num_fc_blks); - if (jbd2_fc_init(journal, num_fc_blocks ? num_fc_blocks : - EXT4_NUM_FC_BLKS)) { - pr_warn("Error while enabling fast commits, turning off."); - ext4_clear_feature_fast_commit(sb); - } } const char *fc_ineligible_reasons[] = { diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index 140fbb6af19e..1d96e0ac8138 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -3,9 +3,6 @@ #ifndef __FAST_COMMIT_H__ #define __FAST_COMMIT_H__ -/* Number of blocks in journal area to allocate for fast commits */ -#define EXT4_NUM_FC_BLKS 256 - /* Fast commit tags */ #define EXT4_FC_TAG_ADD_RANGE 0x0001 #define EXT4_FC_TAG_DEL_RANGE 0x0002 diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8a6dd433bb70..ba02d7c86fb3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4857,6 +4857,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount_wq; } + if (test_opt2(sb, JOURNAL_FAST_COMMIT) && + !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) { + ext4_msg(sb, KERN_ERR, + "Failed to set fast commit journal feature"); + goto failed_mount_wq; + } + /* We have now updated the journal if required, so we can * validate the data journaling mode. */ switch (test_opt(sb, DATA_FLAGS)) { diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index c3c768248527..500152f0421a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1352,19 +1352,12 @@ static journal_t *journal_init_common(struct block_device *bdev, /* We need enough buffers to write out full descriptor block. */ n = journal->j_blocksize / jbd2_min_tag_size(); journal->j_wbufsize = n; + journal->j_fc_wbuf = NULL; journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *), GFP_KERNEL); if (!journal->j_wbuf) goto err_cleanup; - if (journal->j_fc_wbufsize > 0) { - journal->j_fc_wbuf = kmalloc_array(journal->j_fc_wbufsize, - sizeof(struct buffer_head *), - GFP_KERNEL); - if (!journal->j_fc_wbuf) - goto err_cleanup; - } - bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize); if (!bh) { pr_err("%s: Cannot get buffer for journal superblock\n", @@ -1378,23 +1371,11 @@ static journal_t *journal_init_common(struct block_device *bdev, err_cleanup: kfree(journal->j_wbuf); - kfree(journal->j_fc_wbuf); jbd2_journal_destroy_revoke(journal); kfree(journal); return NULL; } -int jbd2_fc_init(journal_t *journal, int num_fc_blks) -{ - journal->j_fc_wbufsize = num_fc_blks; - journal->j_fc_wbuf = kmalloc_array(journal->j_fc_wbufsize, - sizeof(struct buffer_head *), GFP_KERNEL); - if (!journal->j_fc_wbuf) - return -ENOMEM; - return 0; -} -EXPORT_SYMBOL(jbd2_fc_init); - /* jbd2_journal_init_dev and jbd2_journal_init_inode: * * Create a journal structure assigned some fixed set of disk blocks to @@ -1512,16 +1493,7 @@ static int journal_reset(journal_t *journal) } journal->j_first = first; - - if (jbd2_has_feature_fast_commit(journal) && - journal->j_fc_wbufsize > 0) { - journal->j_fc_last = last; - journal->j_last = last - journal->j_fc_wbufsize; - journal->j_fc_first = journal->j_last + 1; - journal->j_fc_off = 0; - } else { - journal->j_last = last; - } + journal->j_last = last; journal->j_head = journal->j_first; journal->j_tail = journal->j_first; @@ -1533,6 +1505,13 @@ static int journal_reset(journal_t *journal) journal->j_max_transaction_buffers = jbd2_journal_get_max_txn_bufs(journal); + /* + * Now that journal recovery is done, turn fast commits off here. This + * way, if fast commit was enabled before the crash but if now FS has + * disabled it, we don't enable fast commits. + */ + jbd2_clear_feature_fast_commit(journal); + /* * As a special case, if the on-disk copy is already marked as needing * no recovery (s_start == 0), then we can safely defer the superblock @@ -1872,6 +1851,7 @@ static int load_superblock(journal_t *journal) { int err; journal_superblock_t *sb; + int num_fc_blocks; err = journal_get_superblock(journal); if (err) @@ -1883,15 +1863,17 @@ static int load_superblock(journal_t *journal) journal->j_tail = be32_to_cpu(sb->s_start); journal->j_first = be32_to_cpu(sb->s_first); journal->j_errno = be32_to_cpu(sb->s_errno); + journal->j_last = be32_to_cpu(sb->s_maxlen); - if (jbd2_has_feature_fast_commit(journal) && - journal->j_fc_wbufsize > 0) { + if (jbd2_has_feature_fast_commit(journal)) { journal->j_fc_last = be32_to_cpu(sb->s_maxlen); - journal->j_last = journal->j_fc_last - journal->j_fc_wbufsize; + num_fc_blocks = be32_to_cpu(sb->s_num_fc_blks); + if (!num_fc_blocks) + num_fc_blocks = JBD2_MIN_FC_BLOCKS; + if (journal->j_last - num_fc_blocks >= JBD2_MIN_JOURNAL_BLOCKS) + journal->j_last = journal->j_fc_last - num_fc_blocks; journal->j_fc_first = journal->j_last + 1; journal->j_fc_off = 0; - } else { - journal->j_last = be32_to_cpu(sb->s_maxlen); } return 0; @@ -1954,9 +1936,6 @@ int jbd2_journal_load(journal_t *journal) */ journal->j_flags &= ~JBD2_ABORT; - if (journal->j_fc_wbufsize > 0) - jbd2_journal_set_features(journal, 0, 0, - JBD2_FEATURE_INCOMPAT_FAST_COMMIT); /* OK, we've finished with the dynamic journal bits: * reinitialise the dynamic contents of the superblock in memory * and reset them on disk. */ @@ -2040,8 +2019,7 @@ int jbd2_journal_destroy(journal_t *journal) jbd2_journal_destroy_revoke(journal); if (journal->j_chksum_driver) crypto_free_shash(journal->j_chksum_driver); - if (journal->j_fc_wbufsize > 0) - kfree(journal->j_fc_wbuf); + kfree(journal->j_fc_wbuf); kfree(journal->j_wbuf); kfree(journal); @@ -2116,6 +2094,37 @@ int jbd2_journal_check_available_features(journal_t *journal, unsigned long comp return 0; } +static int +jbd2_journal_initialize_fast_commit(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + unsigned long long num_fc_blks; + + num_fc_blks = be32_to_cpu(sb->s_num_fc_blks); + if (num_fc_blks == 0) + num_fc_blks = JBD2_MIN_FC_BLOCKS; + if (journal->j_last - num_fc_blks < JBD2_MIN_JOURNAL_BLOCKS) + return -ENOSPC; + + /* Are we called twice? */ + WARN_ON(journal->j_fc_wbuf != NULL); + journal->j_fc_wbuf = kmalloc_array(num_fc_blks, + sizeof(struct buffer_head *), GFP_KERNEL); + if (!journal->j_fc_wbuf) + return -ENOMEM; + + journal->j_fc_wbufsize = num_fc_blks; + journal->j_fc_last = journal->j_last; + journal->j_last = journal->j_fc_last - num_fc_blks; + journal->j_fc_first = journal->j_last + 1; + journal->j_fc_off = 0; + journal->j_free = journal->j_last - journal->j_first; + journal->j_max_transaction_buffers = + jbd2_journal_get_max_txn_bufs(journal); + + return 0; +} + /** * int jbd2_journal_set_features() - Mark a given journal feature in the superblock * @journal: Journal to act on. @@ -2159,6 +2168,13 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat, sb = journal->j_superblock; + if (incompat & JBD2_FEATURE_INCOMPAT_FAST_COMMIT) { + if (jbd2_journal_initialize_fast_commit(journal)) { + pr_err("JBD2: Cannot enable fast commits.\n"); + return 0; + } + } + /* Load the checksum driver if necessary */ if ((journal->j_chksum_driver == NULL) && INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index e0b6b53eae64..b2caf7bbd8e5 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -68,6 +68,7 @@ extern void *jbd2_alloc(size_t size, gfp_t flags); extern void jbd2_free(void *ptr, size_t size); #define JBD2_MIN_JOURNAL_BLOCKS 1024 +#define JBD2_MIN_FC_BLOCKS 256 #ifdef __KERNEL__ @@ -1614,7 +1615,6 @@ extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); extern int jbd2_cleanup_journal_tail(journal_t *); /* Fast commit related APIs */ -int jbd2_fc_init(journal_t *journal, int num_fc_blks); int jbd2_fc_begin_commit(journal_t *journal, tid_t tid); int jbd2_fc_end_commit(journal_t *journal); int jbd2_fc_end_commit_fallback(journal_t *journal, tid_t tid); -- cgit v1.2.3 From c460e5edc85a063ec9cb60addff93d00ed378701 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:58:57 -0800 Subject: jbd2: don't use state lock during commit path Variables journal->j_fc_off, journal->j_fc_wbuf are accessed during commit path. Since today we allow only one process to perform a fast commit, there is no need take state lock before accessing these variables. This patch removes these locks and adds comments to describe this. Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-9-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 6 ------ include/linux/jbd2.h | 10 ++++++---- 2 files changed, 6 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 500152f0421a..778ea50fc8d1 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -865,7 +865,6 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out) int fc_off; *bh_out = NULL; - write_lock(&journal->j_state_lock); if (journal->j_fc_off + journal->j_fc_first < journal->j_fc_last) { fc_off = journal->j_fc_off; @@ -874,7 +873,6 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out) } else { ret = -EINVAL; } - write_unlock(&journal->j_state_lock); if (ret) return ret; @@ -909,9 +907,7 @@ int jbd2_fc_wait_bufs(journal_t *journal, int num_blks) struct buffer_head *bh; int i, j_fc_off; - read_lock(&journal->j_state_lock); j_fc_off = journal->j_fc_off; - read_unlock(&journal->j_state_lock); /* * Wait in reverse order to minimize chances of us being woken up before @@ -939,9 +935,7 @@ int jbd2_fc_release_bufs(journal_t *journal) struct buffer_head *bh; int i, j_fc_off; - read_lock(&journal->j_state_lock); j_fc_off = journal->j_fc_off; - read_unlock(&journal->j_state_lock); /* * Wait in reverse order to minimize chances of us being woken up before diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index b2caf7bbd8e5..5f0ef6380b0c 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -945,8 +945,9 @@ struct journal_s /** * @j_fc_off: * - * Number of fast commit blocks currently allocated. - * [j_state_lock]. + * Number of fast commit blocks currently allocated. Accessed only + * during fast commit. Currently only process can do fast commit, so + * this field is not protected by any lock. */ unsigned long j_fc_off; @@ -1109,8 +1110,9 @@ struct journal_s struct buffer_head **j_wbuf; /** - * @j_fc_wbuf: Array of fast commit bhs for - * jbd2_journal_commit_transaction. + * @j_fc_wbuf: Array of fast commit bhs for fast commit. Accessed only + * during a fast commit. Currently only process can do fast commit, so + * this field is not protected by any lock. */ struct buffer_head **j_fc_wbuf; -- cgit v1.2.3 From 0bce577bf9cae13ae32d391432d0030e3f67fc1d Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:58:58 -0800 Subject: jbd2: don't pass tid to jbd2_fc_end_commit_fallback() In jbd2_fc_end_commit_fallback(), we know which tid to commit. There's no need for caller to pass it. Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-10-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 2 +- fs/jbd2/journal.c | 12 +++++++++--- include/linux/jbd2.h | 2 +- 3 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index bab60c5d5095..e69c580fa91e 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1143,7 +1143,7 @@ out: "Fast commit ended with blks = %d, reason = %d, subtid - %d", nblks, reason, subtid); if (reason == EXT4_FC_REASON_FC_FAILED) - return jbd2_fc_end_commit_fallback(journal, commit_tid); + return jbd2_fc_end_commit_fallback(journal); if (reason == EXT4_FC_REASON_FC_START_FAILED || reason == EXT4_FC_REASON_INELIGIBLE) return jbd2_complete_transaction(journal, commit_tid); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 778ea50fc8d1..59166e299cde 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -777,13 +777,19 @@ static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback) int jbd2_fc_end_commit(journal_t *journal) { - return __jbd2_fc_end_commit(journal, 0, 0); + return __jbd2_fc_end_commit(journal, 0, false); } EXPORT_SYMBOL(jbd2_fc_end_commit); -int jbd2_fc_end_commit_fallback(journal_t *journal, tid_t tid) +int jbd2_fc_end_commit_fallback(journal_t *journal) { - return __jbd2_fc_end_commit(journal, tid, 1); + tid_t tid; + + read_lock(&journal->j_state_lock); + tid = journal->j_running_transaction ? + journal->j_running_transaction->t_tid : 0; + read_unlock(&journal->j_state_lock); + return __jbd2_fc_end_commit(journal, tid, true); } EXPORT_SYMBOL(jbd2_fc_end_commit_fallback); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 5f0ef6380b0c..1c49fd62ff2e 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1619,7 +1619,7 @@ extern int jbd2_cleanup_journal_tail(journal_t *); /* Fast commit related APIs */ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid); int jbd2_fc_end_commit(journal_t *journal); -int jbd2_fc_end_commit_fallback(journal_t *journal, tid_t tid); +int jbd2_fc_end_commit_fallback(journal_t *journal); int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out); int jbd2_submit_inode_data(struct jbd2_inode *jinode); int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode); -- cgit v1.2.3 From 556e0319fbb8eee3fa19fdcc27c8bcb4af1c7211 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:59:07 -0800 Subject: ext4: disable fast commit with data journalling Fast commits don't work with data journalling. This patch disables the fast commit support when data journalling is turned on. Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20201106035911.1942128-19-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 7 +++++++ fs/ext4/fast_commit.h | 1 + fs/ext4/super.c | 3 ++- include/trace/events/ext4.h | 6 ++++-- 4 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 6b963e09af2c..fb9b4e9d82b2 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -483,6 +483,12 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode) if (S_ISDIR(inode->i_mode)) return; + if (ext4_should_journal_data(inode)) { + ext4_fc_mark_ineligible(inode->i_sb, + EXT4_FC_REASON_INODE_JOURNAL_DATA); + return; + } + ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); trace_ext4_fc_track_inode(inode, ret); } @@ -2102,6 +2108,7 @@ const char *fc_ineligible_reasons[] = { "Resize", "Dir renamed", "Falloc range op", + "Data journalling", "FC Commit Failed" }; diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index 1d96e0ac8138..3a6e5a1fa1b8 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -102,6 +102,7 @@ enum { EXT4_FC_REASON_RESIZE, EXT4_FC_REASON_RENAME_DIR, EXT4_FC_REASON_FALLOC_RANGE, + EXT4_FC_REASON_INODE_JOURNAL_DATA, EXT4_FC_COMMIT_FAILED, EXT4_FC_REASON_MAX }; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ba02d7c86fb3..9e2e06ed2c45 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4340,9 +4340,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) #endif if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { - printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, and O_DIRECT support!\n"); + printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, O_DIRECT and fast_commit support!\n"); /* can't mount with both data=journal and dioread_nolock. */ clear_opt(sb, DIOREAD_NOLOCK); + clear_opt2(sb, JOURNAL_FAST_COMMIT); if (test_opt2(sb, EXPLICIT_DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and delalloc"); diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 0f899d3b09d3..70ae5497b73a 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -104,7 +104,8 @@ TRACE_DEFINE_ENUM(ES_REFERENCED_B); { EXT4_FC_REASON_SWAP_BOOT, "SWAP_BOOT"}, \ { EXT4_FC_REASON_RESIZE, "RESIZE"}, \ { EXT4_FC_REASON_RENAME_DIR, "RENAME_DIR"}, \ - { EXT4_FC_REASON_FALLOC_RANGE, "FALLOC_RANGE"}) + { EXT4_FC_REASON_FALLOC_RANGE, "FALLOC_RANGE"}, \ + { EXT4_FC_REASON_INODE_JOURNAL_DATA, "INODE_JOURNAL_DATA"}) TRACE_EVENT(ext4_other_inode_update_time, TP_PROTO(struct inode *inode, ino_t orig_ino), @@ -2917,7 +2918,7 @@ TRACE_EVENT(ext4_fc_stats, ), TP_printk("dev %d:%d fc ineligible reasons:\n" - "%s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d; " + "%s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d; " "num_commits:%ld, ineligible: %ld, numblks: %ld", MAJOR(__entry->dev), MINOR(__entry->dev), FC_REASON_NAME_STAT(EXT4_FC_REASON_XATTR), @@ -2928,6 +2929,7 @@ TRACE_EVENT(ext4_fc_stats, FC_REASON_NAME_STAT(EXT4_FC_REASON_RESIZE), FC_REASON_NAME_STAT(EXT4_FC_REASON_RENAME_DIR), FC_REASON_NAME_STAT(EXT4_FC_REASON_FALLOC_RANGE), + FC_REASON_NAME_STAT(EXT4_FC_REASON_INODE_JOURNAL_DATA), __entry->sbi->s_fc_stats.fc_num_commits, __entry->sbi->s_fc_stats.fc_ineligible_commits, __entry->sbi->s_fc_stats.fc_numblks) -- cgit v1.2.3 From 9a2a9ebc0a758d887ee06e067e9f7f0b36ff7574 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 10 Nov 2020 18:25:57 +0100 Subject: cpufreq: Introduce governor flags A new cpufreq governor flag will be added subsequently, so replace the bool dynamic_switching fleid in struct cpufreq_governor with a flags field and introduce CPUFREQ_GOV_DYNAMIC_SWITCHING to set for the "dynamic switching" governors instead of it. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 2 +- drivers/cpufreq/cpufreq_governor.h | 2 +- include/linux/cpufreq.h | 9 +++++++-- kernel/sched/cpufreq_schedutil.c | 2 +- 4 files changed, 10 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 336b5e94cbc8..0252903f1b43 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2254,7 +2254,7 @@ static int cpufreq_init_governor(struct cpufreq_policy *policy) return -EINVAL; /* Platform doesn't want dynamic frequency switching ? */ - if (policy->governor->dynamic_switching && + if (policy->governor->flags & CPUFREQ_GOV_DYNAMIC_SWITCHING && cpufreq_driver->flags & CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING) { struct cpufreq_governor *gov = cpufreq_fallback_governor(); diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index c56773c25757..bab8e6140377 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -156,7 +156,7 @@ void cpufreq_dbs_governor_limits(struct cpufreq_policy *policy); #define CPUFREQ_DBS_GOVERNOR_INITIALIZER(_name_) \ { \ .name = _name_, \ - .dynamic_switching = true, \ + .flags = CPUFREQ_GOV_DYNAMIC_SWITCHING, \ .owner = THIS_MODULE, \ .init = cpufreq_dbs_governor_init, \ .exit = cpufreq_dbs_governor_exit, \ diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 1eaa04f1bae6..9bdfcf3c4748 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -570,12 +570,17 @@ struct cpufreq_governor { char *buf); int (*store_setspeed) (struct cpufreq_policy *policy, unsigned int freq); - /* For governors which change frequency dynamically by themselves */ - bool dynamic_switching; struct list_head governor_list; struct module *owner; + u8 flags; }; +/* Governor flags */ + +/* For governors which change frequency dynamically by themselves */ +#define CPUFREQ_GOV_DYNAMIC_SWITCHING BIT(0) + + /* Pass a target to the cpufreq driver */ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index d73bccde2720..97d318b0cd0c 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -881,7 +881,7 @@ static void sugov_limits(struct cpufreq_policy *policy) struct cpufreq_governor schedutil_gov = { .name = "schedutil", .owner = THIS_MODULE, - .dynamic_switching = true, + .flags = CPUFREQ_GOV_DYNAMIC_SWITCHING, .init = sugov_init, .exit = sugov_exit, .start = sugov_start, -- cgit v1.2.3 From 218f66870181bec7aaa6e3c72f346039c590c3c2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 10 Nov 2020 18:26:10 +0100 Subject: cpufreq: Introduce CPUFREQ_GOV_STRICT_TARGET Introduce a new governor flag, CPUFREQ_GOV_STRICT_TARGET, for the governors that want the target frequency to be set exactly to the given value without leaving any room for adjustments on the hardware side and set this flag for the powersave and performance governors. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/cpufreq_performance.c | 1 + drivers/cpufreq/cpufreq_powersave.c | 1 + include/linux/cpufreq.h | 3 +++ 3 files changed, 5 insertions(+) (limited to 'include') diff --git a/drivers/cpufreq/cpufreq_performance.c b/drivers/cpufreq/cpufreq_performance.c index 71c1d9aba772..addd93f2a420 100644 --- a/drivers/cpufreq/cpufreq_performance.c +++ b/drivers/cpufreq/cpufreq_performance.c @@ -20,6 +20,7 @@ static void cpufreq_gov_performance_limits(struct cpufreq_policy *policy) static struct cpufreq_governor cpufreq_gov_performance = { .name = "performance", .owner = THIS_MODULE, + .flags = CPUFREQ_GOV_STRICT_TARGET, .limits = cpufreq_gov_performance_limits, }; diff --git a/drivers/cpufreq/cpufreq_powersave.c b/drivers/cpufreq/cpufreq_powersave.c index 7749522355b5..8d830d860e91 100644 --- a/drivers/cpufreq/cpufreq_powersave.c +++ b/drivers/cpufreq/cpufreq_powersave.c @@ -21,6 +21,7 @@ static struct cpufreq_governor cpufreq_gov_powersave = { .name = "powersave", .limits = cpufreq_gov_powersave_limits, .owner = THIS_MODULE, + .flags = CPUFREQ_GOV_STRICT_TARGET, }; MODULE_AUTHOR("Dominik Brodowski "); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 9bdfcf3c4748..6eb9a3b8ec7b 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -580,6 +580,9 @@ struct cpufreq_governor { /* For governors which change frequency dynamically by themselves */ #define CPUFREQ_GOV_DYNAMIC_SWITCHING BIT(0) +/* For governors wanting the target frequency to be set exactly */ +#define CPUFREQ_GOV_STRICT_TARGET BIT(1) + /* Pass a target to the cpufreq driver */ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, -- cgit v1.2.3 From ea9364bbadf11f0c55802cf11387d74f524cee84 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 10 Nov 2020 18:26:37 +0100 Subject: cpufreq: Add strict_target to struct cpufreq_policy Add a new field to be set when the CPUFREQ_GOV_STRICT_TARGET flag is set for the current governor to struct cpufreq_policy, so that the drivers needing to check CPUFREQ_GOV_STRICT_TARGET do not have to access the governor object during every frequency transition. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 2 ++ include/linux/cpufreq.h | 6 ++++++ 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 0252903f1b43..1e7e3f2ff09f 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2280,6 +2280,8 @@ static int cpufreq_init_governor(struct cpufreq_policy *policy) } } + policy->strict_target = !!(policy->governor->flags & CPUFREQ_GOV_STRICT_TARGET); + return 0; } diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 6eb9a3b8ec7b..acbad3b36322 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -109,6 +109,12 @@ struct cpufreq_policy { bool fast_switch_possible; bool fast_switch_enabled; + /* + * Set if the CPUFREQ_GOV_STRICT_TARGET flag is set for the current + * governor. + */ + bool strict_target; + /* * Preferred average time interval between consecutive invocations of * the driver to set the frequency for this policy. To be set by the -- cgit v1.2.3