From d82076403cef7fcd1e7617c9db48bf21ebdc1f9c Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 21 Oct 2022 11:41:10 -0600 Subject: iov_iter: introduce iov_iter_get_pages_[alloc_]flags() Add iov_iter_get_pages_flags() and iov_iter_get_pages_alloc_flags() which take a flags argument that is passed to get_user_pages_fast(). This is so that FOLL_PCI_P2PDMA can be passed when appropriate. Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221021174116.7200-4-logang@deltatee.com Signed-off-by: Jens Axboe --- lib/iov_iter.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) (limited to 'lib') diff --git a/lib/iov_iter.c b/lib/iov_iter.c index c3ca28ca68a6..53efad017f3c 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1430,7 +1430,8 @@ static struct page *first_bvec_segment(const struct iov_iter *i, static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, - unsigned int maxpages, size_t *start) + unsigned int maxpages, size_t *start, + unsigned int gup_flags) { unsigned int n; @@ -1442,7 +1443,6 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, maxsize = MAX_RW_COUNT; if (likely(user_backed_iter(i))) { - unsigned int gup_flags = 0; unsigned long addr; int res; @@ -1492,33 +1492,49 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, return -EFAULT; } -ssize_t iov_iter_get_pages2(struct iov_iter *i, +ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, - size_t *start) + size_t *start, unsigned gup_flags) { if (!maxpages) return 0; BUG_ON(!pages); - return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, start); + return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, + start, gup_flags); +} +EXPORT_SYMBOL_GPL(iov_iter_get_pages); + +ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, + size_t maxsize, unsigned maxpages, size_t *start) +{ + return iov_iter_get_pages(i, pages, maxsize, maxpages, start, 0); } EXPORT_SYMBOL(iov_iter_get_pages2); -ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, +ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, - size_t *start) + size_t *start, unsigned gup_flags) { ssize_t len; *pages = NULL; - len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start); + len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start, + gup_flags); if (len <= 0) { kvfree(*pages); *pages = NULL; } return len; } +EXPORT_SYMBOL_GPL(iov_iter_get_pages_alloc); + +ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, + struct page ***pages, size_t maxsize, size_t *start) +{ + return iov_iter_get_pages_alloc(i, pages, maxsize, start, 0); +} EXPORT_SYMBOL(iov_iter_get_pages_alloc2); size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, -- cgit v1.2.3 From 1567b49d1a4081ba7e1a0ff2dc39bc58c59f2a51 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 21 Oct 2022 11:41:12 -0600 Subject: lib/scatterlist: add check when merging zone device pages Consecutive zone device pages should not be merged into the same sgl or bvec segment with other types of pages or if they belong to different pgmaps. Otherwise getting the pgmap of a given segment is not possible without scanning the entire segment. This helper returns true either if both pages are not zone device pages or both pages are zone device pages with the same pgmap. 
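The helper named in the hunk below, zone_device_pages_have_same_pgmap(), is introduced outside of lib/ in the same series and therefore does not show up in this log. Going by the description above, it amounts to roughly the sketch below; the exact location and form are assumed here, not taken from this log.

/* Sketch only -- reconstructed from the commit message, not from this log. */
static inline bool zone_device_pages_have_same_pgmap(const struct page *a,
						     const struct page *b)
{
	if (is_zone_device_page(a) != is_zone_device_page(b))
		return false;		/* mixing the two kinds never merges */
	if (!is_zone_device_page(a))
		return true;		/* neither page is a zone device page */
	return a->pgmap == b->pgmap;	/* both are: require the same pgmap */
}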
Factor out the check for page mergability into a pages_are_mergable() helper and add a check with zone_device_pages_are_mergeable(). Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221021174116.7200-6-logang@deltatee.com Signed-off-by: Jens Axboe --- lib/scatterlist.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/lib/scatterlist.c b/lib/scatterlist.c index c8c3d675845c..a0ad2a7959b5 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -410,6 +410,15 @@ static struct scatterlist *get_next_sg(struct sg_append_table *table, return new_sg; } +static bool pages_are_mergeable(struct page *a, struct page *b) +{ + if (page_to_pfn(a) != page_to_pfn(b) + 1) + return false; + if (!zone_device_pages_have_same_pgmap(a, b)) + return false; + return true; +} + /** * sg_alloc_append_table_from_pages - Allocate and initialize an append sg * table from an array of pages @@ -447,6 +456,7 @@ int sg_alloc_append_table_from_pages(struct sg_append_table *sgt_append, unsigned int chunks, cur_page, seg_len, i, prv_len = 0; unsigned int added_nents = 0; struct scatterlist *s = sgt_append->prv; + struct page *last_pg; /* * The algorithm below requires max_segment to be aligned to PAGE_SIZE @@ -460,21 +470,17 @@ int sg_alloc_append_table_from_pages(struct sg_append_table *sgt_append, return -EOPNOTSUPP; if (sgt_append->prv) { - unsigned long paddr = - (page_to_pfn(sg_page(sgt_append->prv)) * PAGE_SIZE + - sgt_append->prv->offset + sgt_append->prv->length) / - PAGE_SIZE; - if (WARN_ON(offset)) return -EINVAL; /* Merge contiguous pages into the last SG */ prv_len = sgt_append->prv->length; - while (n_pages && page_to_pfn(pages[0]) == paddr) { + last_pg = sg_page(sgt_append->prv); + while (n_pages && pages_are_mergeable(last_pg, pages[0])) { if (sgt_append->prv->length + PAGE_SIZE > max_segment) break; sgt_append->prv->length += PAGE_SIZE; - paddr++; + last_pg = pages[0]; pages++; n_pages--; } @@ -488,7 +494,7 @@ int sg_alloc_append_table_from_pages(struct sg_append_table *sgt_append, for (i = 1; i < n_pages; i++) { seg_len += PAGE_SIZE; if (seg_len >= max_segment || - page_to_pfn(pages[i]) != page_to_pfn(pages[i - 1]) + 1) { + !pages_are_mergeable(pages[i], pages[i - 1])) { chunks++; seg_len = 0; } @@ -504,8 +510,7 @@ int sg_alloc_append_table_from_pages(struct sg_append_table *sgt_append, for (j = cur_page + 1; j < n_pages; j++) { seg_len += PAGE_SIZE; if (seg_len >= max_segment || - page_to_pfn(pages[j]) != - page_to_pfn(pages[j - 1]) + 1) + !pages_are_mergeable(pages[j], pages[j - 1])) break; } -- cgit v1.2.3 From 4f8126bb2308066b877859e4b5923ffb54143630 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Sat, 5 Nov 2022 19:10:55 -0400 Subject: sbitmap: Use single per-bitmap counting to wake up queued tags sbitmap suffers from code complexity, as demonstrated by recent fixes, and eventual lost wake ups on nested I/O completion. The later happens, from what I understand, due to the non-atomic nature of the updates to wait_cnt, which needs to be subtracted and eventually reset when equal to zero. This two step process can eventually miss an update when a nested completion happens to interrupt the CPU in between the wait_cnt updates. This is very hard to fix, as shown by the recent changes to this code. The code complexity arises mostly from the corner cases to avoid missed wakes in this scenario. 
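To make the two-step update concrete: the pre-patch code (see the removed __sbq_wake_up() hunks below) first decrements wait_cnt in a cmpxchg loop and only later resets it to wake_batch. A loose userspace model of that shape, with the problematic window marked, is sketched here -- illustrative only, with C11 atomics standing in for the kernel's atomic_t, and not code from this series.

#include <stdatomic.h>
#include <stdbool.h>

#define WAKE_BATCH	8

static atomic_int wait_cnt = WAKE_BATCH;	/* stands in for ws->wait_cnt */

static bool old_style_wake(int nr)
{
	int cur = atomic_load(&wait_cnt);
	int sub, left;

	do {
		if (cur == 0)
			return true;		/* another caller owns the reset */
		sub = nr < cur ? nr : cur;
		left = cur - sub;
	} while (!atomic_compare_exchange_weak(&wait_cnt, &cur, left));

	if (left > 0)
		return false;

	/*
	 * This is the window the commit message describes: wait_cnt has
	 * already reached zero, but the reset below is a separate step.
	 * A nested completion interrupting here sees zero, bails out, and
	 * its wakeup can be lost unless every such corner case is handled.
	 */
	atomic_store(&wait_cnt, WAKE_BATCH);	/* second step: the reset */
	return true;
}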
In addition, the handling of wake_batch recalculation plus the synchronization with sbq_queue_wake_up is non-trivial. This patchset implements the idea originally proposed by Jan [1], which removes the need for the two-step updates of wait_cnt. This is done by tracking the number of completions and wakeups in always increasing, per-bitmap counters. Instead of having to reset the wait_cnt when it reaches zero, we simply keep counting, and attempt to wake up N threads in a single wait queue whenever there is enough space for a batch. Waking up less than batch_wake shouldn't be a problem, because we haven't changed the conditions for wake up, and the existing batch calculation guarantees at least enough remaining completions to wake up a batch for each queue at any time. Performance-wise, one should expect very similar performance to the original algorithm for the case where there is no queueing. In both the old algorithm and this implementation, the first thing is to check ws_active, which bails out if there is no queueing to be managed. In the new code, we took care to avoid accounting completions and wakeups when there is no queueing, to not pay the cost of atomic operations unnecessarily, since it doesn't skew the numbers. For more interesting cases, where there is queueing, we need to take into account the cross-communication of the atomic operations. I've been benchmarking by running parallel fio jobs against a single hctx nullb in different hardware queue depth scenarios, and verifying both IOPS and queueing. Each experiment was repeated 5 times on a 20-CPU box, with 20 parallel jobs. fio was issuing fixed-size randwrites with qd=64 against nullb, varying only the hardware queue length per test. queue size 2 4 8 16 32 64 6.1-rc2 1681.1K (1.6K) 2633.0K (12.7K) 6940.8K (16.3K) 8172.3K (617.5K) 8391.7K (367.1K) 8606.1K (351.2K) patched 1721.8K (15.1K) 3016.7K (3.8K) 7543.0K (89.4K) 8132.5K (303.4K) 8324.2K (230.6K) 8401.8K (284.7K) The following is a similar experiment, ran against a nullb with a single bitmap shared by 20 hctx spread across 2 NUMA nodes. This has 40 parallel fio jobs operating on the same device queue size 2 4 8 16 32 64 6.1-rc2 1081.0K (2.3K) 957.2K (1.5K) 1699.1K (5.7K) 6178.2K (124.6K) 12227.9K (37.7K) 13286.6K (92.9K) patched 1081.8K (2.8K) 1316.5K (5.4K) 2364.4K (1.8K) 6151.4K (20.0K) 11893.6K (17.5K) 12385.6K (18.4K) It has also survived blktests and a 12h-stress run against nullb. I also ran the code against nvme and a scsi SSD, and I didn't observe performance regression in those. If there are other tests you think I should run, please let me know and I will follow up with results. [1] https://lore.kernel.org/all/aef9de29-e9f5-259a-f8be-12d1b734e72@google.com/ Cc: Hugh Dickins Cc: Keith Busch Cc: Liu Song Suggested-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20221105231055.25953-1-krisman@suse.de Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 16 +++++-- lib/sbitmap.c | 122 +++++++++++------------------------------------- 2 files changed, 37 insertions(+), 101 deletions(-) (limited to 'lib') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 4d2d5205ab58..d662cf136021 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -86,11 +86,6 @@ struct sbitmap { * struct sbq_wait_state - Wait queue in a &struct sbitmap_queue. */ struct sbq_wait_state { - /** - * @wait_cnt: Number of frees remaining before we wake up. - */ - atomic_t wait_cnt; - /** * @wait: Wait queue. 
*/ @@ -138,6 +133,17 @@ struct sbitmap_queue { * sbitmap_queue_get_shallow() */ unsigned int min_shallow_depth; + + /** + * @completion_cnt: Number of bits cleared passed to the + * wakeup function. + */ + atomic_t completion_cnt; + + /** + * @wakeup_cnt: Number of thread wake ups issued. + */ + atomic_t wakeup_cnt; }; /** diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 7280ae8ca88c..eca462cba398 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -434,6 +434,8 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, sbq->wake_batch = sbq_calc_wake_batch(sbq, depth); atomic_set(&sbq->wake_index, 0); atomic_set(&sbq->ws_active, 0); + atomic_set(&sbq->completion_cnt, 0); + atomic_set(&sbq->wakeup_cnt, 0); sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node); if (!sbq->ws) { @@ -441,40 +443,21 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, return -ENOMEM; } - for (i = 0; i < SBQ_WAIT_QUEUES; i++) { + for (i = 0; i < SBQ_WAIT_QUEUES; i++) init_waitqueue_head(&sbq->ws[i].wait); - atomic_set(&sbq->ws[i].wait_cnt, sbq->wake_batch); - } return 0; } EXPORT_SYMBOL_GPL(sbitmap_queue_init_node); -static inline void __sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, - unsigned int wake_batch) -{ - int i; - - if (sbq->wake_batch != wake_batch) { - WRITE_ONCE(sbq->wake_batch, wake_batch); - /* - * Pairs with the memory barrier in sbitmap_queue_wake_up() - * to ensure that the batch size is updated before the wait - * counts. - */ - smp_mb(); - for (i = 0; i < SBQ_WAIT_QUEUES; i++) - atomic_set(&sbq->ws[i].wait_cnt, 1); - } -} - static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, unsigned int depth) { unsigned int wake_batch; wake_batch = sbq_calc_wake_batch(sbq, depth); - __sbitmap_queue_update_wake_batch(sbq, wake_batch); + if (sbq->wake_batch != wake_batch) + WRITE_ONCE(sbq->wake_batch, wake_batch); } void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq, @@ -488,7 +471,8 @@ void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq, wake_batch = clamp_val(depth / SBQ_WAIT_QUEUES, min_batch, SBQ_WAKE_BATCH); - __sbitmap_queue_update_wake_batch(sbq, wake_batch); + + WRITE_ONCE(sbq->wake_batch, wake_batch); } EXPORT_SYMBOL_GPL(sbitmap_queue_recalculate_wake_batch); @@ -587,7 +571,7 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) for (i = 0; i < SBQ_WAIT_QUEUES; i++) { struct sbq_wait_state *ws = &sbq->ws[wake_index]; - if (waitqueue_active(&ws->wait) && atomic_read(&ws->wait_cnt)) { + if (waitqueue_active(&ws->wait)) { if (wake_index != atomic_read(&sbq->wake_index)) atomic_set(&sbq->wake_index, wake_index); return ws; @@ -599,83 +583,31 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) return NULL; } -static bool __sbq_wake_up(struct sbitmap_queue *sbq, int *nr) +void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr) { - struct sbq_wait_state *ws; - unsigned int wake_batch; - int wait_cnt, cur, sub; - bool ret; + unsigned int wake_batch = READ_ONCE(sbq->wake_batch); + struct sbq_wait_state *ws = NULL; + unsigned int wakeups; - if (*nr <= 0) - return false; + if (!atomic_read(&sbq->ws_active)) + return; - ws = sbq_wake_ptr(sbq); - if (!ws) - return false; + atomic_add(nr, &sbq->completion_cnt); + wakeups = atomic_read(&sbq->wakeup_cnt); - cur = atomic_read(&ws->wait_cnt); do { - /* - * For concurrent callers of this, callers should call this - * function again to wakeup a new batch on a different 'ws'. 
- */ - if (cur == 0) - return true; - sub = min(*nr, cur); - wait_cnt = cur - sub; - } while (!atomic_try_cmpxchg(&ws->wait_cnt, &cur, wait_cnt)); - - /* - * If we decremented queue without waiters, retry to avoid lost - * wakeups. - */ - if (wait_cnt > 0) - return !waitqueue_active(&ws->wait); + if (atomic_read(&sbq->completion_cnt) - wakeups < wake_batch) + return; - *nr -= sub; - - /* - * When wait_cnt == 0, we have to be particularly careful as we are - * responsible to reset wait_cnt regardless whether we've actually - * woken up anybody. But in case we didn't wakeup anybody, we still - * need to retry. - */ - ret = !waitqueue_active(&ws->wait); - wake_batch = READ_ONCE(sbq->wake_batch); + if (!ws) { + ws = sbq_wake_ptr(sbq); + if (!ws) + return; + } + } while (!atomic_try_cmpxchg(&sbq->wakeup_cnt, + &wakeups, wakeups + wake_batch)); - /* - * Wake up first in case that concurrent callers decrease wait_cnt - * while waitqueue is empty. - */ wake_up_nr(&ws->wait, wake_batch); - - /* - * Pairs with the memory barrier in sbitmap_queue_resize() to - * ensure that we see the batch size update before the wait - * count is reset. - * - * Also pairs with the implicit barrier between decrementing wait_cnt - * and checking for waitqueue_active() to make sure waitqueue_active() - * sees result of the wakeup if atomic_dec_return() has seen the result - * of atomic_set(). - */ - smp_mb__before_atomic(); - - /* - * Increase wake_index before updating wait_cnt, otherwise concurrent - * callers can see valid wait_cnt in old waitqueue, which can cause - * invalid wakeup on the old waitqueue. - */ - sbq_index_atomic_inc(&sbq->wake_index); - atomic_set(&ws->wait_cnt, wake_batch); - - return ret || *nr; -} - -void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr) -{ - while (__sbq_wake_up(sbq, &nr)) - ; } EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up); @@ -792,9 +724,7 @@ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m) seq_puts(m, "ws={\n"); for (i = 0; i < SBQ_WAIT_QUEUES; i++) { struct sbq_wait_state *ws = &sbq->ws[i]; - - seq_printf(m, "\t{.wait_cnt=%d, .wait=%s},\n", - atomic_read(&ws->wait_cnt), + seq_printf(m, "\t{.wait=%s},\n", waitqueue_active(&ws->wait) ? "active" : "inactive"); } seq_puts(m, "}\n"); -- cgit v1.2.3 From 42271ca389edb0446b9e492858b4c38083b0b9f8 Mon Sep 17 00:00:00 2001 From: Giulio Benetti Date: Wed, 19 Oct 2022 18:04:07 +0200 Subject: lib/raid6: drop RAID6_USE_EMPTY_ZERO_PAGE RAID6_USE_EMPTY_ZERO_PAGE is unused and hardcoded to 0, so let's drop it. Signed-off-by: Giulio Benetti Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- include/linux/raid/pq.h | 8 -------- lib/raid6/algos.c | 2 -- 2 files changed, 10 deletions(-) (limited to 'lib') diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index d6e5a1feb947..f29aaaf2eb21 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -10,17 +10,9 @@ #ifdef __KERNEL__ -/* Set to 1 to use kernel-wide empty_zero_page */ -#define RAID6_USE_EMPTY_ZERO_PAGE 0 #include -/* We need a pre-zeroed page... if we don't want to use the kernel-provided - one define it here */ -#if RAID6_USE_EMPTY_ZERO_PAGE -# define raid6_empty_zero_page empty_zero_page -#else extern const char raid6_empty_zero_page[PAGE_SIZE]; -#endif #else /* ! 
__KERNEL__ */ /* Used for testing in user space */ diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 39b74221f4a7..a22a05c9af8a 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -18,12 +18,10 @@ #else #include #include -#if !RAID6_USE_EMPTY_ZERO_PAGE /* In .bss so it's zeroed */ const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); EXPORT_SYMBOL(raid6_empty_zero_page); #endif -#endif struct raid6_calls raid6_call; EXPORT_SYMBOL_GPL(raid6_call); -- cgit v1.2.3 From 976570b4ecd30d3ec6e1b0910da8e5edc591f2b6 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Tue, 15 Nov 2022 17:45:51 -0500 Subject: sbitmap: Advance the queue index before waking up a queue When a queue is awaken, the wake_index written by sbq_wake_ptr currently keeps pointing to the same queue. On the next wake up, it will thus retry the same queue, which is unfair to other queues, and can lead to starvation. This patch, moves the index update to happen before the queue is returned, such that it will now try a different queue first on the next wake up, improving fairness. Fixes: 4f8126bb2308 ("sbitmap: Use single per-bitmap counting to wake up queued tags") Reported-by: Jan Kara Reviewed-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20221115224553.23594-2-krisman@suse.de Signed-off-by: Jens Axboe --- lib/sbitmap.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/sbitmap.c b/lib/sbitmap.c index eca462cba398..bea7984f7987 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -571,13 +571,19 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) for (i = 0; i < SBQ_WAIT_QUEUES; i++) { struct sbq_wait_state *ws = &sbq->ws[wake_index]; + /* + * Advance the index before checking the current queue. + * It improves fairness, by ensuring the queue doesn't + * need to be fully emptied before trying to wake up + * from the next one. + */ + wake_index = sbq_index_inc(wake_index); + if (waitqueue_active(&ws->wait)) { if (wake_index != atomic_read(&sbq->wake_index)) atomic_set(&sbq->wake_index, wake_index); return ws; } - - wake_index = sbq_index_inc(wake_index); } return NULL; -- cgit v1.2.3 From 26edb30dd1c0c9be11fa676b4f330ada7b794ba6 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Tue, 15 Nov 2022 17:45:53 -0500 Subject: sbitmap: Try each queue to wake up at least one waiter Jan reported the new algorithm as merged might be problematic if the queue being awaken becomes empty between the waitqueue_active inside sbq_wake_ptr check and the wake up. If that happens, wake_up_nr will not wake up any waiter and we loose too many wake ups. In order to guarantee progress, we need to wake up at least one waiter here, if there are any. This now requires trying to wake up from every queue. Instead of walking through all the queues with sbq_wake_ptr, this call moves the wake up inside that function. In a previous version of the patch, I found that updating wake_index several times when walking through queues had a measurable overhead. This ensures we only update it once, at the end. 
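Putting this change together with the two sbitmap patches above, the resulting wake-up path reads roughly as follows. This is reconstructed by applying the hunks in this log and is simplified (surrounding kernel context omitted); the diffs themselves are authoritative.

static void __sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr)
{
	int i, wake_index;

	if (!atomic_read(&sbq->ws_active))
		return;

	wake_index = atomic_read(&sbq->wake_index);
	for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
		struct sbq_wait_state *ws = &sbq->ws[wake_index];

		/* Advance first, so the next wake up starts at another queue. */
		wake_index = sbq_index_inc(wake_index);

		/* Waking at least one waiter guarantees forward progress. */
		if (waitqueue_active(&ws->wait) && wake_up_nr(&ws->wait, nr))
			break;
	}

	if (wake_index != atomic_read(&sbq->wake_index))
		atomic_set(&sbq->wake_index, wake_index);
}

void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr)
{
	unsigned int wake_batch = READ_ONCE(sbq->wake_batch);
	unsigned int wakeups;

	if (!atomic_read(&sbq->ws_active))
		return;

	atomic_add(nr, &sbq->completion_cnt);
	wakeups = atomic_read(&sbq->wakeup_cnt);

	/* Only the caller that claims a whole batch of completions wakes up. */
	do {
		if (atomic_read(&sbq->completion_cnt) - wakeups < wake_batch)
			return;
	} while (!atomic_try_cmpxchg(&sbq->wakeup_cnt,
				     &wakeups, wakeups + wake_batch));

	__sbitmap_queue_wake_up(sbq, wake_batch);
}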
Fixes: 4f8126bb2308 ("sbitmap: Use single per-bitmap counting to wake up queued tags") Reported-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20221115224553.23594-4-krisman@suse.de Signed-off-by: Jens Axboe --- lib/sbitmap.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'lib') diff --git a/lib/sbitmap.c b/lib/sbitmap.c index bea7984f7987..586deb333237 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -560,12 +560,12 @@ void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq, } EXPORT_SYMBOL_GPL(sbitmap_queue_min_shallow_depth); -static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) +static void __sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr) { int i, wake_index; if (!atomic_read(&sbq->ws_active)) - return NULL; + return; wake_index = atomic_read(&sbq->wake_index); for (i = 0; i < SBQ_WAIT_QUEUES; i++) { @@ -579,20 +579,22 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) */ wake_index = sbq_index_inc(wake_index); - if (waitqueue_active(&ws->wait)) { - if (wake_index != atomic_read(&sbq->wake_index)) - atomic_set(&sbq->wake_index, wake_index); - return ws; - } + /* + * It is sufficient to wake up at least one waiter to + * guarantee forward progress. + */ + if (waitqueue_active(&ws->wait) && + wake_up_nr(&ws->wait, nr)) + break; } - return NULL; + if (wake_index != atomic_read(&sbq->wake_index)) + atomic_set(&sbq->wake_index, wake_index); } void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr) { unsigned int wake_batch = READ_ONCE(sbq->wake_batch); - struct sbq_wait_state *ws = NULL; unsigned int wakeups; if (!atomic_read(&sbq->ws_active)) @@ -604,16 +606,10 @@ void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr) do { if (atomic_read(&sbq->completion_cnt) - wakeups < wake_batch) return; - - if (!ws) { - ws = sbq_wake_ptr(sbq); - if (!ws) - return; - } } while (!atomic_try_cmpxchg(&sbq->wakeup_cnt, &wakeups, wakeups + wake_batch)); - wake_up_nr(&ws->wait, wake_batch); + __sbitmap_queue_wake_up(sbq, wake_batch); } EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up); -- cgit v1.2.3 From f2d03d89615ef65b5dff3aae6581df0b5fcbaa3b Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 22 Nov 2022 14:42:58 +0100 Subject: lru_cache: use atomic operations when accessing lc->flags, always MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Or, depending on the way locking is implemented at the call sites, some updates could be lost (has not been observed). Signed-off-by: Lars Ellenberg Signed-off-by: Christoph Böhmwalder Link: https://lore.kernel.org/r/20221122134301.69258-2-christoph.boehmwalder@linbit.com Signed-off-by: Jens Axboe --- lib/lru_cache.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/lru_cache.c b/lib/lru_cache.c index dc35464216d3..fec899386238 100644 --- a/lib/lru_cache.c +++ b/lib/lru_cache.c @@ -364,7 +364,7 @@ static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, unsig struct lc_element *e; PARANOIA_ENTRY(); - if (lc->flags & LC_STARVING) { + if (test_bit(__LC_STARVING, &lc->flags)) { ++lc->starving; RETURN(NULL); } @@ -417,7 +417,7 @@ static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, unsig * the LRU element, we have to wait ... 
*/ if (!lc_unused_element_available(lc)) { - __set_bit(__LC_STARVING, &lc->flags); + set_bit(__LC_STARVING, &lc->flags); RETURN(NULL); } -- cgit v1.2.3 From 9933438430b3b787f96bb434b4490b0dda59c9b3 Mon Sep 17 00:00:00 2001 From: Christoph Böhmwalder Date: Tue, 22 Nov 2022 14:42:59 +0100 Subject: lru_cache: remove compiled out code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Christoph Böhmwalder Link: https://lore.kernel.org/r/20221122134301.69258-3-christoph.boehmwalder@linbit.com Signed-off-by: Jens Axboe --- lib/lru_cache.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'lib') diff --git a/lib/lru_cache.c b/lib/lru_cache.c index fec899386238..5dd5e4c00a23 100644 --- a/lib/lru_cache.c +++ b/lib/lru_cache.c @@ -60,17 +60,6 @@ int lc_try_lock(struct lru_cache *lc) } while (unlikely (val == LC_PARANOIA)); /* Spin until no-one is inside a PARANOIA_ENTRY()/RETURN() section. */ return 0 == val; -#if 0 - /* Alternative approach, spin in case someone enters or leaves a - * PARANOIA_ENTRY()/RETURN() section. */ - unsigned long old, new, val; - do { - old = lc->flags & LC_PARANOIA; - new = old | LC_LOCKED; - val = cmpxchg(&lc->flags, old, new); - } while (unlikely (val == (old ^ LC_PARANOIA))); - return old == val; -#endif } /** -- cgit v1.2.3 From 2cd10a496a86787367716b684dadfecbb594095b Mon Sep 17 00:00:00 2001 From: Joel Colledge Date: Tue, 22 Nov 2022 14:43:00 +0100 Subject: lru_cache: remove unused lc_private, lc_set, lc_index_of MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Joel Colledge Signed-off-by: Christoph Böhmwalder Link: https://lore.kernel.org/r/20221122134301.69258-4-christoph.boehmwalder@linbit.com Signed-off-by: Jens Axboe --- include/linux/lru_cache.h | 3 --- lib/lru_cache.c | 44 -------------------------------------------- 2 files changed, 47 deletions(-) (limited to 'lib') diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h index 07add7882a5d..c9afcdd9324c 100644 --- a/include/linux/lru_cache.h +++ b/include/linux/lru_cache.h @@ -199,7 +199,6 @@ struct lru_cache { unsigned long flags; - void *lc_private; const char *name; /* nr_elements there */ @@ -241,7 +240,6 @@ extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, unsigned e_count, size_t e_size, size_t e_off); extern void lc_reset(struct lru_cache *lc); extern void lc_destroy(struct lru_cache *lc); -extern void lc_set(struct lru_cache *lc, unsigned int enr, int index); extern void lc_del(struct lru_cache *lc, struct lc_element *element); extern struct lc_element *lc_get_cumulative(struct lru_cache *lc, unsigned int enr); @@ -297,6 +295,5 @@ extern bool lc_is_used(struct lru_cache *lc, unsigned int enr); container_of(ptr, type, member) extern struct lc_element *lc_element_by_index(struct lru_cache *lc, unsigned i); -extern unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e); #endif diff --git a/lib/lru_cache.c b/lib/lru_cache.c index 5dd5e4c00a23..b3d9187611de 100644 --- a/lib/lru_cache.c +++ b/lib/lru_cache.c @@ -574,48 +574,6 @@ struct lc_element *lc_element_by_index(struct lru_cache *lc, unsigned i) return lc->lc_element[i]; } -/** - * lc_index_of - * @lc: the lru cache to operate on - * @e: the element to query for its index position in lc->element - */ -unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e) -{ - PARANOIA_LC_ELEMENT(lc, e); - return e->lc_index; -} - -/** - * lc_set - associate index with label - * 
@lc: the lru cache to operate on - * @enr: the label to set - * @index: the element index to associate label with. - * - * Used to initialize the active set to some previously recorded state. - */ -void lc_set(struct lru_cache *lc, unsigned int enr, int index) -{ - struct lc_element *e; - struct list_head *lh; - - if (index < 0 || index >= lc->nr_elements) - return; - - e = lc_element_by_index(lc, index); - BUG_ON(e->lc_number != e->lc_new_number); - BUG_ON(e->refcnt != 0); - - e->lc_number = e->lc_new_number = enr; - hlist_del_init(&e->colision); - if (enr == LC_FREE) - lh = &lc->free; - else { - hlist_add_head(&e->colision, lc_hash_slot(lc, enr)); - lh = &lc->lru; - } - list_move(&e->list, lh); -} - /** * lc_seq_dump_details - Dump a complete LRU cache to seq in textual form. * @lc: the lru cache to operate on @@ -650,7 +608,6 @@ void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext EXPORT_SYMBOL(lc_create); EXPORT_SYMBOL(lc_reset); EXPORT_SYMBOL(lc_destroy); -EXPORT_SYMBOL(lc_set); EXPORT_SYMBOL(lc_del); EXPORT_SYMBOL(lc_try_get); EXPORT_SYMBOL(lc_find); @@ -658,7 +615,6 @@ EXPORT_SYMBOL(lc_get); EXPORT_SYMBOL(lc_put); EXPORT_SYMBOL(lc_committed); EXPORT_SYMBOL(lc_element_by_index); -EXPORT_SYMBOL(lc_index_of); EXPORT_SYMBOL(lc_seq_printf_stats); EXPORT_SYMBOL(lc_seq_dump_details); EXPORT_SYMBOL(lc_try_lock); -- cgit v1.2.3
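As a footnote to the lc->flags commit above ("use atomic operations when accessing lc->flags, always"): the lost updates it warns about come from mixing a plain read-modify-write of the flags word with a concurrent atomic bit operation. A minimal userspace demonstration of that pattern is sketched below -- hypothetical demo code using GCC builtins and pthreads rather than the kernel's bitops, and since the race is timing-dependent the lost bit will not show up on every run.

#include <pthread.h>
#include <stdio.h>

static unsigned long flags;

#define LC_STARVING	(1UL << 0)
#define LC_DIRTY	(1UL << 1)

static void *set_starving_atomically(void *arg)
{
	(void)arg;
	/* models set_bit(__LC_STARVING, &lc->flags) */
	__atomic_fetch_or(&flags, LC_STARVING, __ATOMIC_RELAXED);
	return NULL;
}

int main(void)
{
	pthread_t t;
	unsigned long tmp;

	pthread_create(&t, NULL, set_starving_atomically, NULL);

	/*
	 * Models a non-atomic __set_bit(): load, modify, store.  If the
	 * atomic OR in the other thread lands between the load and the
	 * store, its bit is overwritten and that update is lost -- which
	 * is why the patch converts every lc->flags access to the atomic
	 * helpers.
	 */
	tmp = flags;
	tmp |= LC_DIRTY;
	flags = tmp;

	pthread_join(t, NULL);
	printf("flags=%#lx (bit 0 may be missing)\n", flags);
	return 0;
}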