From 541be05095437a7e5e08e7d13a13e03ec0994ae7 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 25 Sep 2019 16:45:56 -0700 Subject: linux/coff.h: add include guard Add a header include guard just in case. My motivation is to allow Kbuild to detect missing include guard: https://patchwork.kernel.org/patch/11063011/ Before I enable this checker I want to fix as many headers as possible. Link: http://lkml.kernel.org/r/20190728154728.11126-1-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/coff.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/coff.h b/include/uapi/linux/coff.h index e4a79f80b9a0..ab5c7e847eed 100644 --- a/include/uapi/linux/coff.h +++ b/include/uapi/linux/coff.h @@ -11,6 +11,9 @@ more information about COFF, then O'Reilly has a very excellent book. */ +#ifndef _UAPI_LINUX_COFF_H +#define _UAPI_LINUX_COFF_H + #define E_SYMNMLEN 8 /* Number of characters in a symbol name */ #define E_FILNMLEN 14 /* Number of characters in a file name */ #define E_DIMNUM 4 /* Number of array dimensions in auxiliary entry */ @@ -350,3 +353,5 @@ struct COFF_reloc { /* For new sections we haven't heard of before */ #define COFF_DEF_SECTION_ALIGNMENT 4 + +#endif /* _UAPI_LINUX_COFF_H */ -- cgit v1.2.3 From 444b8a83f1e01584ff2d53f5951d8e836c0070b5 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Wed, 25 Sep 2019 16:46:04 -0700 Subject: augmented rbtree: add comments for RB_DECLARE_CALLBACKS macro Patch series "make RB_DECLARE_CALLBACKS more generic", v3. These changes are intended to make the RB_DECLARE_CALLBACKS macro more generic (allowing the aubmented subtree information to be a struct instead of a scalar). I have verified the compiled lib/interval_tree.o and mm/mmap.o files to check that they didn't change. This held as expected for interval_tree.o; mmap.o did have some changes which could be reverted by marking __vma_link_rb as noinline. I did not add such a change to the patchset; I felt it was reasonable enough to leave the inlining decision up to the compiler. This patch (of 3): Add a short comment summarizing the arguments to RB_DECLARE_CALLBACKS. The arguments are also now capitalized. This copies the style of the INTERVAL_TREE_DEFINE macro. No functional changes in this commit, only comments and capitalization. Link: http://lkml.kernel.org/r/20190703040156.56953-2-walken@google.com Signed-off-by: Michel Lespinasse Acked-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Cc: David Howells Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rbtree_augmented.h | 54 +++++++++++++++++++++------------- tools/include/linux/rbtree_augmented.h | 54 +++++++++++++++++++++------------- 2 files changed, 66 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h index 179faab29f52..979941600082 100644 --- a/include/linux/rbtree_augmented.h +++ b/include/linux/rbtree_augmented.h @@ -60,39 +60,51 @@ rb_insert_augmented_cached(struct rb_node *node, rb_insert_augmented(node, &root->rb_root, augment); } -#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ - rbtype, rbaugmented, rbcompute) \ +/* + * Template for declaring augmented rbtree callbacks + * + * RBSTATIC: 'static' or empty + * RBNAME: name of the rb_augment_callbacks structure + * RBSTRUCT: struct type of the tree nodes + * RBFIELD: name of struct rb_node field within RBSTRUCT + * RBTYPE: type of the RBAUGMENTED field + * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree + * RBCOMPUTE: name of function that recomputes the RBAUGMENTED data + */ + +#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ + RBTYPE, RBAUGMENTED, RBCOMPUTE) \ static inline void \ -rbname ## _propagate(struct rb_node *rb, struct rb_node *stop) \ +RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \ { \ while (rb != stop) { \ - rbstruct *node = rb_entry(rb, rbstruct, rbfield); \ - rbtype augmented = rbcompute(node); \ - if (node->rbaugmented == augmented) \ + RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \ + RBTYPE augmented = RBCOMPUTE(node); \ + if (node->RBAUGMENTED == augmented) \ break; \ - node->rbaugmented = augmented; \ - rb = rb_parent(&node->rbfield); \ + node->RBAUGMENTED = augmented; \ + rb = rb_parent(&node->RBFIELD); \ } \ } \ static inline void \ -rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ +RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ - rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ - rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ - new->rbaugmented = old->rbaugmented; \ + RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ + RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ + new->RBAUGMENTED = old->RBAUGMENTED; \ } \ static void \ -rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ +RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ - rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ - rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ - new->rbaugmented = old->rbaugmented; \ - old->rbaugmented = rbcompute(old); \ + RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ + RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ + new->RBAUGMENTED = old->RBAUGMENTED; \ + old->RBAUGMENTED = RBCOMPUTE(old); \ } \ -rbstatic const struct rb_augment_callbacks rbname = { \ - .propagate = rbname ## _propagate, \ - .copy = rbname ## _copy, \ - .rotate = rbname ## _rotate \ +RBSTATIC const struct rb_augment_callbacks RBNAME = { \ + .propagate = RBNAME ## _propagate, \ + .copy = RBNAME ## _copy, \ + .rotate = RBNAME ## _rotate \ }; diff --git a/tools/include/linux/rbtree_augmented.h b/tools/include/linux/rbtree_augmented.h index 467a3eefe1d2..de3a480204ba 100644 --- a/tools/include/linux/rbtree_augmented.h +++ b/tools/include/linux/rbtree_augmented.h @@ -62,39 +62,51 @@ rb_insert_augmented_cached(struct rb_node *node, rb_insert_augmented(node, &root->rb_root, augment); } -#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ - rbtype, rbaugmented, rbcompute) \ +/* + * Template for declaring augmented rbtree callbacks + * + * RBSTATIC: 'static' or empty + * RBNAME: name of the rb_augment_callbacks structure + * RBSTRUCT: struct type of the tree nodes + * RBFIELD: name of struct rb_node field within RBSTRUCT + * RBTYPE: type of the RBAUGMENTED field + * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree + * RBCOMPUTE: name of function that recomputes the RBAUGMENTED data + */ + +#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ + RBTYPE, RBAUGMENTED, RBCOMPUTE) \ static inline void \ -rbname ## _propagate(struct rb_node *rb, struct rb_node *stop) \ +RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \ { \ while (rb != stop) { \ - rbstruct *node = rb_entry(rb, rbstruct, rbfield); \ - rbtype augmented = rbcompute(node); \ - if (node->rbaugmented == augmented) \ + RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \ + RBTYPE augmented = RBCOMPUTE(node); \ + if (node->RBAUGMENTED == augmented) \ break; \ - node->rbaugmented = augmented; \ - rb = rb_parent(&node->rbfield); \ + node->RBAUGMENTED = augmented; \ + rb = rb_parent(&node->RBFIELD); \ } \ } \ static inline void \ -rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ +RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ - rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ - rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ - new->rbaugmented = old->rbaugmented; \ + RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ + RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ + new->RBAUGMENTED = old->RBAUGMENTED; \ } \ static void \ -rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ +RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ - rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ - rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ - new->rbaugmented = old->rbaugmented; \ - old->rbaugmented = rbcompute(old); \ + RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ + RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ + new->RBAUGMENTED = old->RBAUGMENTED; \ + old->RBAUGMENTED = RBCOMPUTE(old); \ } \ -rbstatic const struct rb_augment_callbacks rbname = { \ - .propagate = rbname ## _propagate, \ - .copy = rbname ## _copy, \ - .rotate = rbname ## _rotate \ +RBSTATIC const struct rb_augment_callbacks RBNAME = { \ + .propagate = RBNAME ## _propagate, \ + .copy = RBNAME ## _copy, \ + .rotate = RBNAME ## _rotate \ }; -- cgit v1.2.3 From 315cc066b8ae8349a27887ad7a34e1916e9797fe Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Wed, 25 Sep 2019 16:46:07 -0700 Subject: augmented rbtree: add new RB_DECLARE_CALLBACKS_MAX macro Add RB_DECLARE_CALLBACKS_MAX, which generates augmented rbtree callbacks for the case where the augmented value is a scalar whose definition follows a max(f(node)) pattern. This actually covers all present uses of RB_DECLARE_CALLBACKS, and saves some (source) code duplication in the various RBCOMPUTE function definitions. [walken@google.com: fix mm/vmalloc.c] Link: http://lkml.kernel.org/r/CANN689FXgK13wDYNh1zKxdipeTuALG4eKvKpsdZqKFJ-rvtGiQ@mail.gmail.com [walken@google.com: re-add check to check_augmented()] Link: http://lkml.kernel.org/r/20190727022027.GA86863@google.com Link: http://lkml.kernel.org/r/20190703040156.56953-3-walken@google.com Signed-off-by: Michel Lespinasse Acked-by: Peter Zijlstra (Intel) Cc: David Howells Cc: Davidlohr Bueso Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pat_rbtree.c | 19 +++-------------- drivers/block/drbd/drbd_interval.c | 29 +++----------------------- include/linux/interval_tree_generic.h | 22 ++------------------ include/linux/rbtree_augmented.h | 36 ++++++++++++++++++++++++++++++++- lib/rbtree_test.c | 37 ++++++++++++++++------------------ mm/mmap.c | 29 ++++++++++++++++---------- mm/vmalloc.c | 5 ++--- tools/include/linux/rbtree_augmented.h | 36 ++++++++++++++++++++++++++++++++- 8 files changed, 115 insertions(+), 98 deletions(-) (limited to 'include') diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index fa16036fa592..65ebe4b88f7c 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -54,23 +54,10 @@ static u64 get_subtree_max_end(struct rb_node *node) return ret; } -static u64 compute_subtree_max_end(struct memtype *data) -{ - u64 max_end = data->end, child_max_end; - - child_max_end = get_subtree_max_end(data->rb.rb_right); - if (child_max_end > max_end) - max_end = child_max_end; - - child_max_end = get_subtree_max_end(data->rb.rb_left); - if (child_max_end > max_end) - max_end = child_max_end; - - return max_end; -} +#define NODE_END(node) ((node)->end) -RB_DECLARE_CALLBACKS(static, memtype_rb_augment_cb, struct memtype, rb, - u64, subtree_max_end, compute_subtree_max_end) +RB_DECLARE_CALLBACKS_MAX(static, memtype_rb_augment_cb, + struct memtype, rb, u64, subtree_max_end, NODE_END) /* Find the first (lowest start addr) overlapping range from rb tree */ static struct memtype *memtype_rb_lowest_match(struct rb_root *root, diff --git a/drivers/block/drbd/drbd_interval.c b/drivers/block/drbd/drbd_interval.c index c58986556161..651bd0236a99 100644 --- a/drivers/block/drbd/drbd_interval.c +++ b/drivers/block/drbd/drbd_interval.c @@ -13,33 +13,10 @@ sector_t interval_end(struct rb_node *node) return this->end; } -/** - * compute_subtree_last - compute end of @node - * - * The end of an interval is the highest (start + (size >> 9)) value of this - * node and of its children. Called for @node and its parents whenever the end - * may have changed. - */ -static inline sector_t -compute_subtree_last(struct drbd_interval *node) -{ - sector_t max = node->sector + (node->size >> 9); - - if (node->rb.rb_left) { - sector_t left = interval_end(node->rb.rb_left); - if (left > max) - max = left; - } - if (node->rb.rb_right) { - sector_t right = interval_end(node->rb.rb_right); - if (right > max) - max = right; - } - return max; -} +#define NODE_END(node) ((node)->sector + ((node)->size >> 9)) -RB_DECLARE_CALLBACKS(static, augment_callbacks, struct drbd_interval, rb, - sector_t, end, compute_subtree_last); +RB_DECLARE_CALLBACKS_MAX(static, augment_callbacks, + struct drbd_interval, rb, sector_t, end, NODE_END); /** * drbd_insert_interval - insert a new interval into a tree diff --git a/include/linux/interval_tree_generic.h b/include/linux/interval_tree_generic.h index 855476145fe1..aaa8a0767aa3 100644 --- a/include/linux/interval_tree_generic.h +++ b/include/linux/interval_tree_generic.h @@ -30,26 +30,8 @@ \ /* Callbacks for augmented rbtree insert and remove */ \ \ -static inline ITTYPE ITPREFIX ## _compute_subtree_last(ITSTRUCT *node) \ -{ \ - ITTYPE max = ITLAST(node), subtree_last; \ - if (node->ITRB.rb_left) { \ - subtree_last = rb_entry(node->ITRB.rb_left, \ - ITSTRUCT, ITRB)->ITSUBTREE; \ - if (max < subtree_last) \ - max = subtree_last; \ - } \ - if (node->ITRB.rb_right) { \ - subtree_last = rb_entry(node->ITRB.rb_right, \ - ITSTRUCT, ITRB)->ITSUBTREE; \ - if (max < subtree_last) \ - max = subtree_last; \ - } \ - return max; \ -} \ - \ -RB_DECLARE_CALLBACKS(static, ITPREFIX ## _augment, ITSTRUCT, ITRB, \ - ITTYPE, ITSUBTREE, ITPREFIX ## _compute_subtree_last) \ +RB_DECLARE_CALLBACKS_MAX(static, ITPREFIX ## _augment, \ + ITSTRUCT, ITRB, ITTYPE, ITSUBTREE, ITLAST) \ \ /* Insert / remove interval nodes from the tree */ \ \ diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h index 979941600082..e5937e387e02 100644 --- a/include/linux/rbtree_augmented.h +++ b/include/linux/rbtree_augmented.h @@ -61,7 +61,7 @@ rb_insert_augmented_cached(struct rb_node *node, } /* - * Template for declaring augmented rbtree callbacks + * Template for declaring augmented rbtree callbacks (generic case) * * RBSTATIC: 'static' or empty * RBNAME: name of the rb_augment_callbacks structure @@ -107,6 +107,40 @@ RBSTATIC const struct rb_augment_callbacks RBNAME = { \ .rotate = RBNAME ## _rotate \ }; +/* + * Template for declaring augmented rbtree callbacks, + * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes. + * + * RBSTATIC: 'static' or empty + * RBNAME: name of the rb_augment_callbacks structure + * RBSTRUCT: struct type of the tree nodes + * RBFIELD: name of struct rb_node field within RBSTRUCT + * RBTYPE: type of the RBAUGMENTED field + * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree + * RBCOMPUTE: name of function that returns the per-node RBTYPE scalar + */ + +#define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ + RBTYPE, RBAUGMENTED, RBCOMPUTE) \ +static inline RBTYPE RBNAME ## _compute_max(RBSTRUCT *node) \ +{ \ + RBSTRUCT *child; \ + RBTYPE max = RBCOMPUTE(node); \ + if (node->RBFIELD.rb_left) { \ + child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD); \ + if (child->RBAUGMENTED > max) \ + max = child->RBAUGMENTED; \ + } \ + if (node->RBFIELD.rb_right) { \ + child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD); \ + if (child->RBAUGMENTED > max) \ + max = child->RBAUGMENTED; \ + } \ + return max; \ +} \ +RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ + RBTYPE, RBAUGMENTED, RBNAME ## _compute_max) + #define RB_RED 0 #define RB_BLACK 1 diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index 62b8ee92643d..41ae3c7570d3 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -77,26 +77,10 @@ static inline void erase_cached(struct test_node *node, struct rb_root_cached *r } -static inline u32 augment_recompute(struct test_node *node) -{ - u32 max = node->val, child_augmented; - if (node->rb.rb_left) { - child_augmented = rb_entry(node->rb.rb_left, struct test_node, - rb)->augmented; - if (max < child_augmented) - max = child_augmented; - } - if (node->rb.rb_right) { - child_augmented = rb_entry(node->rb.rb_right, struct test_node, - rb)->augmented; - if (max < child_augmented) - max = child_augmented; - } - return max; -} +#define NODE_VAL(node) ((node)->val) -RB_DECLARE_CALLBACKS(static, augment_callbacks, struct test_node, rb, - u32, augmented, augment_recompute) +RB_DECLARE_CALLBACKS_MAX(static, augment_callbacks, + struct test_node, rb, u32, augmented, NODE_VAL) static void insert_augmented(struct test_node *node, struct rb_root_cached *root) @@ -238,7 +222,20 @@ static void check_augmented(int nr_nodes) check(nr_nodes); for (rb = rb_first(&root.rb_root); rb; rb = rb_next(rb)) { struct test_node *node = rb_entry(rb, struct test_node, rb); - WARN_ON_ONCE(node->augmented != augment_recompute(node)); + u32 subtree, max = node->val; + if (node->rb.rb_left) { + subtree = rb_entry(node->rb.rb_left, struct test_node, + rb)->augmented; + if (max < subtree) + max = subtree; + } + if (node->rb.rb_right) { + subtree = rb_entry(node->rb.rb_right, struct test_node, + rb)->augmented; + if (max < subtree) + max = subtree; + } + WARN_ON_ONCE(node->augmented != max); } } diff --git a/mm/mmap.c b/mm/mmap.c index f1e8c7f93e04..14b7da317ec0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -289,9 +289,9 @@ out: return retval; } -static long vma_compute_subtree_gap(struct vm_area_struct *vma) +static inline unsigned long vma_compute_gap(struct vm_area_struct *vma) { - unsigned long max, prev_end, subtree_gap; + unsigned long gap, prev_end; /* * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we @@ -299,14 +299,21 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma) * an unmapped area; whereas when expanding we only require one. * That's a little inconsistent, but keeps the code here simpler. */ - max = vm_start_gap(vma); + gap = vm_start_gap(vma); if (vma->vm_prev) { prev_end = vm_end_gap(vma->vm_prev); - if (max > prev_end) - max -= prev_end; + if (gap > prev_end) + gap -= prev_end; else - max = 0; + gap = 0; } + return gap; +} + +#ifdef CONFIG_DEBUG_VM_RB +static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma) +{ + unsigned long max = vma_compute_gap(vma), subtree_gap; if (vma->vm_rb.rb_left) { subtree_gap = rb_entry(vma->vm_rb.rb_left, struct vm_area_struct, vm_rb)->rb_subtree_gap; @@ -322,7 +329,6 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma) return max; } -#ifdef CONFIG_DEBUG_VM_RB static int browse_rb(struct mm_struct *mm) { struct rb_root *root = &mm->mm_rb; @@ -428,8 +434,9 @@ static void validate_mm(struct mm_struct *mm) #define validate_mm(mm) do { } while (0) #endif -RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb, - unsigned long, rb_subtree_gap, vma_compute_subtree_gap) +RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks, + struct vm_area_struct, vm_rb, + unsigned long, rb_subtree_gap, vma_compute_gap) /* * Update augmented rbtree rb_subtree_gap values after vma->vm_start or @@ -439,8 +446,8 @@ RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb, static void vma_gap_update(struct vm_area_struct *vma) { /* - * As it turns out, RB_DECLARE_CALLBACKS() already created a callback - * function that does exactly what we want. + * As it turns out, RB_DECLARE_CALLBACKS_MAX() already created + * a callback function that does exactly what we want. */ vma_gap_callbacks_propagate(&vma->vm_rb, NULL); } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index fcadd3e25c0c..a3c70e275f4e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -396,9 +396,8 @@ compute_subtree_max_size(struct vmap_area *va) get_subtree_max_size(va->rb_node.rb_right)); } -RB_DECLARE_CALLBACKS(static, free_vmap_area_rb_augment_cb, - struct vmap_area, rb_node, unsigned long, subtree_max_size, - compute_subtree_max_size) +RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb, + struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size) static void purge_vmap_area_lazy(void); static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); diff --git a/tools/include/linux/rbtree_augmented.h b/tools/include/linux/rbtree_augmented.h index de3a480204ba..4e8c4c76e9a2 100644 --- a/tools/include/linux/rbtree_augmented.h +++ b/tools/include/linux/rbtree_augmented.h @@ -63,7 +63,7 @@ rb_insert_augmented_cached(struct rb_node *node, } /* - * Template for declaring augmented rbtree callbacks + * Template for declaring augmented rbtree callbacks (generic case) * * RBSTATIC: 'static' or empty * RBNAME: name of the rb_augment_callbacks structure @@ -109,6 +109,40 @@ RBSTATIC const struct rb_augment_callbacks RBNAME = { \ .rotate = RBNAME ## _rotate \ }; +/* + * Template for declaring augmented rbtree callbacks, + * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes. + * + * RBSTATIC: 'static' or empty + * RBNAME: name of the rb_augment_callbacks structure + * RBSTRUCT: struct type of the tree nodes + * RBFIELD: name of struct rb_node field within RBSTRUCT + * RBTYPE: type of the RBAUGMENTED field + * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree + * RBCOMPUTE: name of function that returns the per-node RBTYPE scalar + */ + +#define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ + RBTYPE, RBAUGMENTED, RBCOMPUTE) \ +static inline RBTYPE RBNAME ## _compute_max(RBSTRUCT *node) \ +{ \ + RBSTRUCT *child; \ + RBTYPE max = RBCOMPUTE(node); \ + if (node->RBFIELD.rb_left) { \ + child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD); \ + if (child->RBAUGMENTED > max) \ + max = child->RBAUGMENTED; \ + } \ + if (node->RBFIELD.rb_right) { \ + child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD); \ + if (child->RBAUGMENTED > max) \ + max = child->RBAUGMENTED; \ + } \ + return max; \ +} \ +RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ + RBTYPE, RBAUGMENTED, RBNAME ## _compute_max) + #define RB_RED 0 #define RB_BLACK 1 -- cgit v1.2.3 From 6d2052d188d962ffb7ad3d413e6ffd5f276aec94 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Wed, 25 Sep 2019 16:46:10 -0700 Subject: augmented rbtree: rework the RB_DECLARE_CALLBACKS macro definition Change the definition of the RBCOMPUTE function. The propagate callback repeatedly calls RBCOMPUTE as it moves from leaf to root. it wants to stop recomputing once the augmented subtree information doesn't change. This was previously checked using the == operator, but that only works when the augmented subtree information is a scalar field. This commit modifies the RBCOMPUTE function so that it now sets the augmented subtree information instead of returning it, and returns a boolean value indicating if the propagate callback should stop. The motivation for this change is that I want to introduce augmented rbtree uses where the augmented data for the subtree is a struct instead of a scalar. Link: http://lkml.kernel.org/r/20190703040156.56953-4-walken@google.com Signed-off-by: Michel Lespinasse Acked-by: Peter Zijlstra (Intel) Cc: David Howells Cc: Davidlohr Bueso Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rbtree_augmented.h | 24 ++++++++++++------------ tools/include/linux/rbtree_augmented.h | 24 ++++++++++++------------ 2 files changed, 24 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h index e5937e387e02..fdd421b8d9ae 100644 --- a/include/linux/rbtree_augmented.h +++ b/include/linux/rbtree_augmented.h @@ -67,22 +67,19 @@ rb_insert_augmented_cached(struct rb_node *node, * RBNAME: name of the rb_augment_callbacks structure * RBSTRUCT: struct type of the tree nodes * RBFIELD: name of struct rb_node field within RBSTRUCT - * RBTYPE: type of the RBAUGMENTED field - * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree + * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree * RBCOMPUTE: name of function that recomputes the RBAUGMENTED data */ -#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ - RBTYPE, RBAUGMENTED, RBCOMPUTE) \ +#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ + RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE) \ static inline void \ RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \ { \ while (rb != stop) { \ RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \ - RBTYPE augmented = RBCOMPUTE(node); \ - if (node->RBAUGMENTED == augmented) \ + if (RBCOMPUTE(node, true)) \ break; \ - node->RBAUGMENTED = augmented; \ rb = rb_parent(&node->RBFIELD); \ } \ } \ @@ -99,7 +96,7 @@ RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ new->RBAUGMENTED = old->RBAUGMENTED; \ - old->RBAUGMENTED = RBCOMPUTE(old); \ + RBCOMPUTE(old, false); \ } \ RBSTATIC const struct rb_augment_callbacks RBNAME = { \ .propagate = RBNAME ## _propagate, \ @@ -122,7 +119,7 @@ RBSTATIC const struct rb_augment_callbacks RBNAME = { \ #define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ RBTYPE, RBAUGMENTED, RBCOMPUTE) \ -static inline RBTYPE RBNAME ## _compute_max(RBSTRUCT *node) \ +static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit) \ { \ RBSTRUCT *child; \ RBTYPE max = RBCOMPUTE(node); \ @@ -136,10 +133,13 @@ static inline RBTYPE RBNAME ## _compute_max(RBSTRUCT *node) \ if (child->RBAUGMENTED > max) \ max = child->RBAUGMENTED; \ } \ - return max; \ + if (exit && node->RBAUGMENTED == max) \ + return true; \ + node->RBAUGMENTED = max; \ + return false; \ } \ -RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ - RBTYPE, RBAUGMENTED, RBNAME ## _compute_max) +RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ + RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max) #define RB_RED 0 diff --git a/tools/include/linux/rbtree_augmented.h b/tools/include/linux/rbtree_augmented.h index 4e8c4c76e9a2..381aa948610d 100644 --- a/tools/include/linux/rbtree_augmented.h +++ b/tools/include/linux/rbtree_augmented.h @@ -69,22 +69,19 @@ rb_insert_augmented_cached(struct rb_node *node, * RBNAME: name of the rb_augment_callbacks structure * RBSTRUCT: struct type of the tree nodes * RBFIELD: name of struct rb_node field within RBSTRUCT - * RBTYPE: type of the RBAUGMENTED field - * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree + * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree * RBCOMPUTE: name of function that recomputes the RBAUGMENTED data */ -#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ - RBTYPE, RBAUGMENTED, RBCOMPUTE) \ +#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ + RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE) \ static inline void \ RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \ { \ while (rb != stop) { \ RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \ - RBTYPE augmented = RBCOMPUTE(node); \ - if (node->RBAUGMENTED == augmented) \ + if (RBCOMPUTE(node, true)) \ break; \ - node->RBAUGMENTED = augmented; \ rb = rb_parent(&node->RBFIELD); \ } \ } \ @@ -101,7 +98,7 @@ RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ new->RBAUGMENTED = old->RBAUGMENTED; \ - old->RBAUGMENTED = RBCOMPUTE(old); \ + RBCOMPUTE(old, false); \ } \ RBSTATIC const struct rb_augment_callbacks RBNAME = { \ .propagate = RBNAME ## _propagate, \ @@ -124,7 +121,7 @@ RBSTATIC const struct rb_augment_callbacks RBNAME = { \ #define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ RBTYPE, RBAUGMENTED, RBCOMPUTE) \ -static inline RBTYPE RBNAME ## _compute_max(RBSTRUCT *node) \ +static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit) \ { \ RBSTRUCT *child; \ RBTYPE max = RBCOMPUTE(node); \ @@ -138,10 +135,13 @@ static inline RBTYPE RBNAME ## _compute_max(RBSTRUCT *node) \ if (child->RBAUGMENTED > max) \ max = child->RBAUGMENTED; \ } \ - return max; \ + if (exit && node->RBAUGMENTED == max) \ + return true; \ + node->RBAUGMENTED = max; \ + return false; \ } \ -RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ - RBTYPE, RBAUGMENTED, RBNAME ## _compute_max) +RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ + RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max) #define RB_RED 0 -- cgit v1.2.3 From 917cda2790c4bd624c5191b8d9edd12121749e86 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 25 Sep 2019 16:46:13 -0700 Subject: kernel-doc: core-api: include string.h into core-api core-api should show all the various string functions including the newly added stracpy and stracpy_pad. Miscellanea: o Update the Returns: value for strscpy o fix a defect with %NUL) [joe@perches.com: correct return of -E2BIG descriptions] Link: http://lkml.kernel.org/r/29f998b4c1a9d69fbeae70500ba0daa4b340c546.1563889130.git.joe@perches.com Link: http://lkml.kernel.org/r/224a6ebf39955f4107c0c376d66155d970e46733.1563841972.git.joe@perches.com Signed-off-by: Joe Perches Reviewed-by: Kees Cook Cc: Jonathan Corbet Cc: Stephen Kitt Cc: Nitin Gote Cc: Rasmus Villemoes Cc: Jann Horn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/core-api/kernel-api.rst | 3 +++ include/linux/string.h | 5 +++-- lib/string.c | 10 ++++++---- 3 files changed, 12 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst index 08af5caf036d..f77de49b1d51 100644 --- a/Documentation/core-api/kernel-api.rst +++ b/Documentation/core-api/kernel-api.rst @@ -42,6 +42,9 @@ String Manipulation .. kernel-doc:: lib/string.c :export: +.. kernel-doc:: include/linux/string.h + :internal: + .. kernel-doc:: mm/util.c :functions: kstrdup kstrdup_const kstrndup kmemdup kmemdup_nul memdup_user vmemdup_user strndup_user memdup_user_nul diff --git a/include/linux/string.h b/include/linux/string.h index 4deb11f7976b..b2f9df7f0761 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -474,8 +474,9 @@ static inline void memcpy_and_pad(void *dest, size_t dest_len, * But this can lead to bugs due to typos, or if prefix is a pointer * and not a constant. Instead use str_has_prefix(). * - * Returns: 0 if @str does not start with @prefix - strlen(@prefix) if @str does start with @prefix + * Returns: + * * strlen(@prefix) if @str starts with @prefix + * * 0 if @str does not start with @prefix */ static __always_inline size_t str_has_prefix(const char *str, const char *prefix) { diff --git a/lib/string.c b/lib/string.c index 461fb620f85f..f7bc10da4259 100644 --- a/lib/string.c +++ b/lib/string.c @@ -173,8 +173,9 @@ EXPORT_SYMBOL(strlcpy); * doesn't unnecessarily force the tail of the destination buffer to be * zeroed. If zeroing is desired please use strscpy_pad(). * - * Return: The number of characters copied (not including the trailing - * %NUL) or -E2BIG if the destination buffer wasn't big enough. + * Returns: + * * The number of characters copied (not including the trailing %NUL) + * * -E2BIG if count is 0 or @src was truncated. */ ssize_t strscpy(char *dest, const char *src, size_t count) { @@ -253,8 +254,9 @@ EXPORT_SYMBOL(strscpy); * For full explanation of why you may want to consider using the * 'strscpy' functions please see the function docstring for strscpy(). * - * Return: The number of characters copied (not including the trailing - * %NUL) or -E2BIG if the destination buffer wasn't big enough. + * Returns: + * * The number of characters copied (not including the trailing %NUL) + * * -E2BIG if count is 0 or @src was truncated. */ ssize_t strscpy_pad(char *dest, const char *src, size_t count) { -- cgit v1.2.3 From d1a445d3b86c9341ce7a0954c23be0edb5c9bec5 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Wed, 25 Sep 2019 16:46:16 -0700 Subject: include/trace/events/writeback.h: fix -Wstringop-truncation warnings There are many of those warnings. In file included from ./arch/powerpc/include/asm/paca.h:15, from ./arch/powerpc/include/asm/current.h:13, from ./include/linux/thread_info.h:21, from ./include/asm-generic/preempt.h:5, from ./arch/powerpc/include/generated/asm/preempt.h:1, from ./include/linux/preempt.h:78, from ./include/linux/spinlock.h:51, from fs/fs-writeback.c:19: In function 'strncpy', inlined from 'perf_trace_writeback_page_template' at ./include/trace/events/writeback.h:56:1: ./include/linux/string.h:260:9: warning: '__builtin_strncpy' specified bound 32 equals destination size [-Wstringop-truncation] return __builtin_strncpy(p, q, size); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fix it by using the new strscpy_pad() which was introduced in "lib/string: Add strscpy_pad() function" and will always be NUL-terminated instead of strncpy(). Also, change strlcpy() to use strscpy_pad() in this file for consistency. Link: http://lkml.kernel.org/r/1564075099-27750-1-git-send-email-cai@lca.pw Fixes: 455b2864686d ("writeback: Initial tracing support") Fixes: 028c2dd184c0 ("writeback: Add tracing to balance_dirty_pages") Fixes: e84d0a4f8e39 ("writeback: trace event writeback_queue_io") Fixes: b48c104d2211 ("writeback: trace event bdi_dirty_ratelimit") Fixes: cc1676d917f3 ("writeback: Move requeueing when I_SYNC set to writeback_sb_inodes()") Fixes: 9fb0a7da0c52 ("writeback: add more tracepoints") Signed-off-by: Qian Cai Reviewed-by: Jan Kara Cc: Tobin C. Harding Cc: Steven Rostedt (VMware) Cc: Ingo Molnar Cc: Tejun Heo Cc: Dave Chinner Cc: Fengguang Wu Cc: Jens Axboe Cc: Joe Perches Cc: Kees Cook Cc: Jann Horn Cc: Jonathan Corbet Cc: Nitin Gote Cc: Rasmus Villemoes Cc: Stephen Kitt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/writeback.h | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 3a27335fce2c..c2ce6480b4b1 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -66,8 +66,9 @@ DECLARE_EVENT_CLASS(writeback_page_template, ), TP_fast_assign( - strncpy(__entry->name, - mapping ? dev_name(inode_to_bdi(mapping->host)->dev) : "(unknown)", 32); + strscpy_pad(__entry->name, + mapping ? dev_name(inode_to_bdi(mapping->host)->dev) : "(unknown)", + 32); __entry->ino = mapping ? mapping->host->i_ino : 0; __entry->index = page->index; ), @@ -110,8 +111,8 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, struct backing_dev_info *bdi = inode_to_bdi(inode); /* may be called for files on pseudo FSes w/ unregistered bdi */ - strncpy(__entry->name, - bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32); + strscpy_pad(__entry->name, + bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->flags = flags; @@ -316,8 +317,8 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template, ), TP_fast_assign( - strncpy(__entry->name, - dev_name(inode_to_bdi(inode)->dev), 32); + strscpy_pad(__entry->name, + dev_name(inode_to_bdi(inode)->dev), 32); __entry->ino = inode->i_ino; __entry->sync_mode = wbc->sync_mode; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); @@ -360,8 +361,9 @@ DECLARE_EVENT_CLASS(writeback_work_class, __field(unsigned int, cgroup_ino) ), TP_fast_assign( - strncpy(__entry->name, - wb->bdi->dev ? dev_name(wb->bdi->dev) : "(unknown)", 32); + strscpy_pad(__entry->name, + wb->bdi->dev ? dev_name(wb->bdi->dev) : + "(unknown)", 32); __entry->nr_pages = work->nr_pages; __entry->sb_dev = work->sb ? work->sb->s_dev : 0; __entry->sync_mode = work->sync_mode; @@ -414,7 +416,7 @@ DECLARE_EVENT_CLASS(writeback_class, __field(unsigned int, cgroup_ino) ), TP_fast_assign( - strncpy(__entry->name, dev_name(wb->bdi->dev), 32); + strscpy_pad(__entry->name, dev_name(wb->bdi->dev), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: cgroup_ino=%u", @@ -436,7 +438,7 @@ TRACE_EVENT(writeback_bdi_register, __array(char, name, 32) ), TP_fast_assign( - strncpy(__entry->name, dev_name(bdi->dev), 32); + strscpy_pad(__entry->name, dev_name(bdi->dev), 32); ), TP_printk("bdi %s", __entry->name @@ -461,7 +463,7 @@ DECLARE_EVENT_CLASS(wbc_class, ), TP_fast_assign( - strncpy(__entry->name, dev_name(bdi->dev), 32); + strscpy_pad(__entry->name, dev_name(bdi->dev), 32); __entry->nr_to_write = wbc->nr_to_write; __entry->pages_skipped = wbc->pages_skipped; __entry->sync_mode = wbc->sync_mode; @@ -512,7 +514,7 @@ TRACE_EVENT(writeback_queue_io, ), TP_fast_assign( unsigned long *older_than_this = work->older_than_this; - strncpy(__entry->name, dev_name(wb->bdi->dev), 32); + strscpy_pad(__entry->name, dev_name(wb->bdi->dev), 32); __entry->older = older_than_this ? *older_than_this : 0; __entry->age = older_than_this ? (jiffies - *older_than_this) * 1000 / HZ : -1; @@ -598,7 +600,7 @@ TRACE_EVENT(bdi_dirty_ratelimit, ), TP_fast_assign( - strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32); + strscpy_pad(__entry->bdi, dev_name(wb->bdi->dev), 32); __entry->write_bw = KBps(wb->write_bandwidth); __entry->avg_write_bw = KBps(wb->avg_write_bandwidth); __entry->dirty_rate = KBps(dirty_rate); @@ -663,7 +665,7 @@ TRACE_EVENT(balance_dirty_pages, TP_fast_assign( unsigned long freerun = (thresh + bg_thresh) / 2; - strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32); + strscpy_pad(__entry->bdi, dev_name(wb->bdi->dev), 32); __entry->limit = global_wb_domain.dirty_limit; __entry->setpoint = (global_wb_domain.dirty_limit + @@ -723,8 +725,8 @@ TRACE_EVENT(writeback_sb_inodes_requeue, ), TP_fast_assign( - strncpy(__entry->name, - dev_name(inode_to_bdi(inode)->dev), 32); + strscpy_pad(__entry->name, + dev_name(inode_to_bdi(inode)->dev), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->dirtied_when = inode->dirtied_when; @@ -797,8 +799,8 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, ), TP_fast_assign( - strncpy(__entry->name, - dev_name(inode_to_bdi(inode)->dev), 32); + strscpy_pad(__entry->name, + dev_name(inode_to_bdi(inode)->dev), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->dirtied_when = inode->dirtied_when; -- cgit v1.2.3 From 091cb0994edd20d67521094ac9c6ec9804058d9a Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 25 Sep 2019 16:46:29 -0700 Subject: lib/hexdump: make print_hex_dump_bytes() a nop on !DEBUG builds I'm seeing a bunch of debug prints from a user of print_hex_dump_bytes() in my kernel logs, but I don't have CONFIG_DYNAMIC_DEBUG enabled nor do I have DEBUG defined in my build. The problem is that print_hex_dump_bytes() calls a wrapper function in lib/hexdump.c that calls print_hex_dump() with KERN_DEBUG level. There are three cases to consider here 1. CONFIG_DYNAMIC_DEBUG=y --> call dynamic_hex_dum() 2. CONFIG_DYNAMIC_DEBUG=n && DEBUG --> call print_hex_dump() 3. CONFIG_DYNAMIC_DEBUG=n && !DEBUG --> stub it out Right now, that last case isn't detected and we still call print_hex_dump() from the stub wrapper. Let's make print_hex_dump_bytes() only call print_hex_dump_debug() so that it works properly in all cases. Case #1, print_hex_dump_debug() calls dynamic_hex_dump() and we get same behavior. Case #2, print_hex_dump_debug() calls print_hex_dump() with KERN_DEBUG and we get the same behavior. Case #3, print_hex_dump_debug() is a nop, changing behavior to what we want, i.e. print nothing. Link: http://lkml.kernel.org/r/20190816235624.115280-1-swboyd@chromium.org Signed-off-by: Stephen Boyd Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 22 +++++++++++++++------- lib/hexdump.c | 21 --------------------- 2 files changed, 15 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/linux/printk.h b/include/linux/printk.h index cefd374c47b1..c09d67edda3a 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -488,13 +488,6 @@ extern int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, extern void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii); -#if defined(CONFIG_DYNAMIC_DEBUG) -#define print_hex_dump_bytes(prefix_str, prefix_type, buf, len) \ - dynamic_hex_dump(prefix_str, prefix_type, 16, 1, buf, len, true) -#else -extern void print_hex_dump_bytes(const char *prefix_str, int prefix_type, - const void *buf, size_t len); -#endif /* defined(CONFIG_DYNAMIC_DEBUG) */ #else static inline void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, int rowsize, int groupsize, @@ -526,4 +519,19 @@ static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type, } #endif +/** + * print_hex_dump_bytes - shorthand form of print_hex_dump() with default params + * @prefix_str: string to prefix each line with; + * caller supplies trailing spaces for alignment if desired + * @prefix_type: controls whether prefix of an offset, address, or none + * is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE) + * @buf: data blob to dump + * @len: number of bytes in the @buf + * + * Calls print_hex_dump(), with log level of KERN_DEBUG, + * rowsize of 16, groupsize of 1, and ASCII output included. + */ +#define print_hex_dump_bytes(prefix_str, prefix_type, buf, len) \ + print_hex_dump_debug(prefix_str, prefix_type, 16, 1, buf, len, true) + #endif diff --git a/lib/hexdump.c b/lib/hexdump.c index b1d55b669ae2..147133f8eb2f 100644 --- a/lib/hexdump.c +++ b/lib/hexdump.c @@ -270,25 +270,4 @@ void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, } EXPORT_SYMBOL(print_hex_dump); -#if !defined(CONFIG_DYNAMIC_DEBUG) -/** - * print_hex_dump_bytes - shorthand form of print_hex_dump() with default params - * @prefix_str: string to prefix each line with; - * caller supplies trailing spaces for alignment if desired - * @prefix_type: controls whether prefix of an offset, address, or none - * is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE) - * @buf: data blob to dump - * @len: number of bytes in the @buf - * - * Calls print_hex_dump(), with log level of KERN_DEBUG, - * rowsize of 16, groupsize of 1, and ASCII output included. - */ -void print_hex_dump_bytes(const char *prefix_str, int prefix_type, - const void *buf, size_t len) -{ - print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, 16, 1, - buf, len, true); -} -EXPORT_SYMBOL(print_hex_dump_bytes); -#endif /* !defined(CONFIG_DYNAMIC_DEBUG) */ #endif /* defined(CONFIG_PRINTK) */ -- cgit v1.2.3 From 8495f7e6732ed248b648d36439795b42ec650b9e Mon Sep 17 00:00:00 2001 From: Sai Praneeth Prakhya Date: Wed, 25 Sep 2019 16:47:27 -0700 Subject: fork: improve error message for corrupted page tables When a user process exits, the kernel cleans up the mm_struct of the user process and during cleanup, check_mm() checks the page tables of the user process for corruption (E.g: unexpected page flags set/cleared). For corrupted page tables, the error message printed by check_mm() isn't very clear as it prints the loop index instead of page table type (E.g: Resident file mapping pages vs Resident shared memory pages). The loop index in check_mm() is used to index rss_stat[] which represents individual memory type stats. Hence, instead of printing index, print memory type, thereby improving error message. Without patch: -------------- [ 204.836425] mm/pgtable-generic.c:29: bad p4d 0000000089eb4e92(800000025f941467) [ 204.836544] BUG: Bad rss-counter state mm:00000000f75895ea idx:0 val:2 [ 204.836615] BUG: Bad rss-counter state mm:00000000f75895ea idx:1 val:5 [ 204.836685] BUG: non-zero pgtables_bytes on freeing mm: 20480 With patch: ----------- [ 69.815453] mm/pgtable-generic.c:29: bad p4d 0000000084653642(800000025ca37467) [ 69.815872] BUG: Bad rss-counter state mm:00000000014a6c03 type:MM_FILEPAGES val:2 [ 69.815962] BUG: Bad rss-counter state mm:00000000014a6c03 type:MM_ANONPAGES val:5 [ 69.816050] BUG: non-zero pgtables_bytes on freeing mm: 20480 Also, change print function (from printk(KERN_ALERT, ..) to pr_alert()) so that it matches the other print statement. Link: http://lkml.kernel.org/r/da75b5153f617f4c5739c08ee6ebeb3d19db0fbc.1565123758.git.sai.praneeth.prakhya@intel.com Signed-off-by: Sai Praneeth Prakhya Reviewed-by: Anshuman Khandual Suggested-by: Dave Hansen Acked-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Dave Hansen Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types_task.h | 4 ++++ kernel/fork.c | 16 ++++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index d7016dcb245e..c1bc6731125c 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -36,6 +36,10 @@ struct vmacache { struct vm_area_struct *vmas[VMACACHE_SIZE]; }; +/* + * When updating this, please also update struct resident_page_types[] in + * kernel/fork.c + */ enum { MM_FILEPAGES, /* Resident file mapping pages */ MM_ANONPAGES, /* Resident anonymous pages */ diff --git a/kernel/fork.c b/kernel/fork.c index 5a0fd518e04e..60763c043aa3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -125,6 +125,15 @@ int nr_threads; /* The idle threads do not count.. */ static int max_threads; /* tunable limit on nr_threads */ +#define NAMED_ARRAY_INDEX(x) [x] = __stringify(x) + +static const char * const resident_page_types[] = { + NAMED_ARRAY_INDEX(MM_FILEPAGES), + NAMED_ARRAY_INDEX(MM_ANONPAGES), + NAMED_ARRAY_INDEX(MM_SWAPENTS), + NAMED_ARRAY_INDEX(MM_SHMEMPAGES), +}; + DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ @@ -645,12 +654,15 @@ static void check_mm(struct mm_struct *mm) { int i; + BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS, + "Please make sure 'struct resident_page_types[]' is updated as well"); + for (i = 0; i < NR_MM_COUNTERS; i++) { long x = atomic_long_read(&mm->rss_stat.count[i]); if (unlikely(x)) - printk(KERN_ALERT "BUG: Bad rss-counter state " - "mm:%p idx:%d val:%ld\n", mm, i, x); + pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", + mm, resident_page_types[i], x); } if (mm_pgtables_bytes(mm)) -- cgit v1.2.3 From 2a4a4082cd4438333b5ecffdd15d1a484e5a83c7 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 25 Sep 2019 16:47:30 -0700 Subject: cpumask: nicer for_each_cpumask_and() signature Mask arguments can be swapped without changing anything. Make arguments names reflect that: #define for_each_cpu_and(cpu, mask1, mask2) Link: http://lkml.kernel.org/r/20190724183350.GA15041@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpumask.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index b5a5a1ed9efd..78a73eba64dd 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -200,8 +200,8 @@ static inline unsigned int cpumask_local_spread(unsigned int i, int node) for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) #define for_each_cpu_wrap(cpu, mask, start) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)(start)) -#define for_each_cpu_and(cpu, mask, and) \ - for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)and) +#define for_each_cpu_and(cpu, mask1, mask2) \ + for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask1, (void)mask2) #else /** * cpumask_first - get the first cpu in a cpumask @@ -290,20 +290,20 @@ extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool /** * for_each_cpu_and - iterate over every cpu in both masks * @cpu: the (optionally unsigned) integer iterator - * @mask: the first cpumask pointer - * @and: the second cpumask pointer + * @mask1: the first cpumask pointer + * @mask2: the second cpumask pointer * * This saves a temporary CPU mask in many places. It is equivalent to: * struct cpumask tmp; - * cpumask_and(&tmp, &mask, &and); + * cpumask_and(&tmp, &mask1, &mask2); * for_each_cpu(cpu, &tmp) * ... * * After the loop, cpu is >= nr_cpu_ids. */ -#define for_each_cpu_and(cpu, mask, and) \ +#define for_each_cpu_and(cpu, mask1, mask2) \ for ((cpu) = -1; \ - (cpu) = cpumask_next_and((cpu), (mask), (and)), \ + (cpu) = cpumask_next_and((cpu), (mask1), (mask2)), \ (cpu) < nr_cpu_ids;) #endif /* SMP */ -- cgit v1.2.3 From d5372c39132958679c480d0295dd328c741c7a41 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Wed, 25 Sep 2019 16:47:36 -0700 Subject: kexec: restore arch_kexec_kernel_image_probe declaration arch_kexec_kernel_image_probe function declaration has been removed by commit 9ec4ecef0af7 ("kexec_file,x86,powerpc: factor out kexec_file_ops functions"). Still this function is overridden by couple of architectures and proper prototype declaration is therefore important, so bring it back. This fixes the following sparse warning on s390: arch/s390/kernel/machine_kexec_file.c:333:5: warning: symbol 'arch_kexec_kernel_image_probe' was not declared. Should it be static? Link: http://lkml.kernel.org/r/patch.git-ff1c9045ebdc.your-ad-here.call-01564402297-ext-5690@work.hours Signed-off-by: Vasily Gorbik Acked-by: Dave Young Reviewed-by: Bhupesh Sharma Cc: Eric Biederman Cc: AKASHI Takahiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kexec.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index f0b809258ed3..cc162f3e6461 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -183,6 +183,8 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, bool get_value); void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name); +int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, + unsigned long buf_len); void * __weak arch_kexec_kernel_image_load(struct kimage *image); int __weak arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section, -- cgit v1.2.3 From 9dd819a15162f8f82a6001b090caa38c18297b39 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:47:39 -0700 Subject: uaccess: add missing __must_check attributes The usercopy implementation comments describe that callers of the copy_*_user() family of functions must always have their return values checked. This can be enforced at compile time with __must_check, so add it where needed. Link: http://lkml.kernel.org/r/201908251609.ADAD5CAAC1@keescook Signed-off-by: Kees Cook Cc: Alexander Viro Cc: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/thread_info.h | 2 +- include/linux/uaccess.h | 21 +++++++++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 8d8821b3689a..659a4400517b 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -134,7 +134,7 @@ static inline void copy_overflow(int size, unsigned long count) WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); } -static __always_inline bool +static __always_inline __must_check bool check_copy_size(const void *addr, size_t bytes, bool is_source) { int sz = __compiletime_object_size(addr); diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 34a038563d97..70bbdc38dc37 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -55,7 +55,7 @@ * as usual) and both source and destination can trigger faults. */ -static __always_inline unsigned long +static __always_inline __must_check unsigned long __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) { kasan_check_write(to, n); @@ -63,7 +63,7 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) return raw_copy_from_user(to, from, n); } -static __always_inline unsigned long +static __always_inline __must_check unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { might_fault(); @@ -85,7 +85,7 @@ __copy_from_user(void *to, const void __user *from, unsigned long n) * The caller should also make sure he pins the user space address * so that we don't result in page fault and sleep. */ -static __always_inline unsigned long +static __always_inline __must_check unsigned long __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) { kasan_check_read(from, n); @@ -93,7 +93,7 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) return raw_copy_to_user(to, from, n); } -static __always_inline unsigned long +static __always_inline __must_check unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); @@ -103,7 +103,7 @@ __copy_to_user(void __user *to, const void *from, unsigned long n) } #ifdef INLINE_COPY_FROM_USER -static inline unsigned long +static inline __must_check unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res = n; @@ -117,12 +117,12 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) return res; } #else -extern unsigned long +extern __must_check unsigned long _copy_from_user(void *, const void __user *, unsigned long); #endif #ifdef INLINE_COPY_TO_USER -static inline unsigned long +static inline __must_check unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); @@ -133,7 +133,7 @@ _copy_to_user(void __user *to, const void *from, unsigned long n) return n; } #else -extern unsigned long +extern __must_check unsigned long _copy_to_user(void __user *, const void *, unsigned long); #endif @@ -222,8 +222,9 @@ static inline bool pagefault_disabled(void) #ifndef ARCH_HAS_NOCACHE_UACCESS -static inline unsigned long __copy_from_user_inatomic_nocache(void *to, - const void __user *from, unsigned long n) +static inline __must_check unsigned long +__copy_from_user_inatomic_nocache(void *to, const void __user *from, + unsigned long n) { return __copy_from_user_inatomic(to, from, n); } -- cgit v1.2.3 From 7d92bda271ddcbb2d1be2f82733dcb9bf8378010 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 25 Sep 2019 16:47:45 -0700 Subject: kgdb: don't use a notifier to enter kgdb at panic; call directly Right now kgdb/kdb hooks up to debug panics by registering for the panic notifier. This works OK except that it means that kgdb/kdb gets called _after_ the CPUs in the system are taken offline. That means that if anything important was happening on those CPUs (like something that might have contributed to the panic) you can't debug them. Specifically I ran into a case where I got a panic because a task was "blocked for more than 120 seconds" which was detected on CPU 2. I nicely got shown stack traces in the kernel log for all CPUs including CPU 0, which was running 'PID: 111 Comm: kworker/0:1H' and was in the middle of __mmc_switch(). I then ended up at the kdb prompt where switched over to kgdb to try to look at local variables of the process on CPU 0. I found that I couldn't. Digging more, I found that I had no info on any tasks running on CPUs other than CPU 2 and that asking kdb for help showed me "Error: no saved data for this cpu". This was because all the CPUs were offline. Let's move the entry of kdb/kgdb to a direct call from panic() and stop using the generic notifier. Putting a direct call in allows us to order things more properly and it also doesn't seem like we're breaking any abstractions by calling into the debugger from the panic function. Daniel said: : This patch changes the way kdump and kgdb interact with each other. : However it would seem rather odd to have both tools simultaneously armed : and, even if they were, the user still has the option to use panic_timeout : to force a kdump to happen. Thus I think the change of order is : acceptable. Link: http://lkml.kernel.org/r/20190703170354.217312-1-dianders@chromium.org Signed-off-by: Douglas Anderson Reviewed-by: Daniel Thompson Cc: Jason Wessel Cc: Kees Cook Cc: Borislav Petkov Cc: Thomas Gleixner Cc: Feng Tang Cc: YueHaibing Cc: Sergey Senozhatsky Cc: "Steven Rostedt (VMware)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kgdb.h | 2 ++ kernel/debug/debug_core.c | 31 +++++++++++-------------------- kernel/panic.c | 8 ++++++++ 3 files changed, 21 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index fbf144aaa749..b072aeb1fd78 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -326,8 +326,10 @@ extern atomic_t kgdb_active; (raw_smp_processor_id() == atomic_read(&kgdb_active)) extern bool dbg_is_early; extern void __init dbg_late_init(void); +extern void kgdb_panic(const char *msg); #else /* ! CONFIG_KGDB */ #define in_dbg_master() (0) #define dbg_late_init() +static inline void kgdb_panic(const char *msg) {} #endif /* ! CONFIG_KGDB */ #endif /* _KGDB_H_ */ diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 10f1187b3907..f76d6f77dd5e 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -893,30 +893,25 @@ static struct sysrq_key_op sysrq_dbg_op = { }; #endif -static int kgdb_panic_event(struct notifier_block *self, - unsigned long val, - void *data) +void kgdb_panic(const char *msg) { + if (!kgdb_io_module_registered) + return; + /* - * Avoid entering the debugger if we were triggered due to a panic - * We don't want to get stuck waiting for input from user in such case. - * panic_timeout indicates the system should automatically + * We don't want to get stuck waiting for input from user if + * "panic_timeout" indicates the system should automatically * reboot on panic. */ if (panic_timeout) - return NOTIFY_DONE; + return; if (dbg_kdb_mode) - kdb_printf("PANIC: %s\n", (char *)data); + kdb_printf("PANIC: %s\n", msg); + kgdb_breakpoint(); - return NOTIFY_DONE; } -static struct notifier_block kgdb_panic_event_nb = { - .notifier_call = kgdb_panic_event, - .priority = INT_MAX, -}; - void __weak kgdb_arch_late(void) { } @@ -965,8 +960,6 @@ static void kgdb_register_callbacks(void) kgdb_arch_late(); register_module_notifier(&dbg_module_load_nb); register_reboot_notifier(&dbg_reboot_notifier); - atomic_notifier_chain_register(&panic_notifier_list, - &kgdb_panic_event_nb); #ifdef CONFIG_MAGIC_SYSRQ register_sysrq_key('g', &sysrq_dbg_op); #endif @@ -980,16 +973,14 @@ static void kgdb_register_callbacks(void) static void kgdb_unregister_callbacks(void) { /* - * When this routine is called KGDB should unregister from the - * panic handler and clean up, making sure it is not handling any + * When this routine is called KGDB should unregister from + * handlers and clean up, making sure it is not handling any * break exceptions at the time. */ if (kgdb_io_module_registered) { kgdb_io_module_registered = 0; unregister_reboot_notifier(&dbg_reboot_notifier); unregister_module_notifier(&dbg_module_load_nb); - atomic_notifier_chain_unregister(&panic_notifier_list, - &kgdb_panic_event_nb); kgdb_arch_exit(); #ifdef CONFIG_MAGIC_SYSRQ unregister_sysrq_key('g', &sysrq_dbg_op); diff --git a/kernel/panic.c b/kernel/panic.c index 057540b6eee9..d1ece4c363b9 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -219,6 +220,13 @@ void panic(const char *fmt, ...) dump_stack(); #endif + /* + * If kgdb is enabled, give it a chance to run before we stop all + * the other CPUs or else we won't be able to debug processes left + * running on them. + */ + kgdb_panic(buf); + /* * If we have crashed and we have a crash kernel loaded let it handle * everything else. -- cgit v1.2.3 From ee8711336c51708382627ebcaee5f2122b77dfef Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:47:52 -0700 Subject: bug: refactor away warn_slowpath_fmt_taint() Patch series "Clean up WARN() "cut here" handling", v2. Christophe Leroy noticed that the fix for missing "cut here" in the WARN() case was adding explicit printk() calls instead of teaching the exception handler to add it. This refactors the bug/warn infrastructure to pass this information as a new BUGFLAG. Longer details repeated from the last patch in the series: bug: move WARN_ON() "cut here" into exception handler The original cleanup of "cut here" missed the WARN_ON() case (that does not have a printk message), which was fixed recently by adding an explicit printk of "cut here". This had the downside of adding a printk() to every WARN_ON() caller, which reduces the utility of using an instruction exception to streamline the resulting code. By making this a new BUGFLAG, all of these can be removed and "cut here" can be handled by the exception handler. This was very pronounced on PowerPC, but the effect can be seen on x86 as well. The resulting text size of a defconfig build shows some small savings from this patch: text data bss dec hex filename 19691167 5134320 1646664 26472151 193eed7 vmlinux.before 19676362 5134260 1663048 26473670 193f4c6 vmlinux.after This change also opens the door for creating something like BUG_MSG(), where a custom printk() before issuing BUG(), without confusing the "cut here" line. This patch (of 7): There's no reason to have specialized helpers for passing the warn taint down to __warn(). Consolidate and refactor helper macros, removing __WARN_printf() and warn_slowpath_fmt_taint(). Link: http://lkml.kernel.org/r/20190819234111.9019-2-keescook@chromium.org Signed-off-by: Kees Cook Cc: Christophe Leroy Cc: Peter Zijlstra Cc: Christophe Leroy Cc: Drew Davenport Cc: Arnd Bergmann Cc: "Steven Rostedt (VMware)" Cc: Feng Tang Cc: Petr Mladek Cc: Mauro Carvalho Chehab Cc: Borislav Petkov Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 13 ++++--------- kernel/panic.c | 18 +++--------------- 2 files changed, 7 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 7357a3c942a0..c3a9c16a2b69 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -90,24 +90,19 @@ struct bug_entry { * Use the versions with printk format strings to provide better diagnostics. */ #ifndef __WARN_TAINT -extern __printf(3, 4) -void warn_slowpath_fmt(const char *file, const int line, - const char *fmt, ...); extern __printf(4, 5) -void warn_slowpath_fmt_taint(const char *file, const int line, unsigned taint, - const char *fmt, ...); +void warn_slowpath_fmt(const char *file, const int line, unsigned taint, + const char *fmt, ...); extern void warn_slowpath_null(const char *file, const int line); #define WANT_WARN_ON_SLOWPATH #define __WARN() warn_slowpath_null(__FILE__, __LINE__) -#define __WARN_printf(arg...) warn_slowpath_fmt(__FILE__, __LINE__, arg) #define __WARN_printf_taint(taint, arg...) \ - warn_slowpath_fmt_taint(__FILE__, __LINE__, taint, arg) + warn_slowpath_fmt(__FILE__, __LINE__, taint, arg) #else extern __printf(1, 2) void __warn_printk(const char *fmt, ...); #define __WARN() do { \ printk(KERN_WARNING CUT_HERE); __WARN_TAINT(TAINT_WARN); \ } while (0) -#define __WARN_printf(arg...) __WARN_printf_taint(TAINT_WARN, arg) #define __WARN_printf_taint(taint, arg...) \ do { __warn_printk(arg); __WARN_TAINT(taint); } while (0) #endif @@ -132,7 +127,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, #define WARN(condition, format...) ({ \ int __ret_warn_on = !!(condition); \ if (unlikely(__ret_warn_on)) \ - __WARN_printf(format); \ + __WARN_printf_taint(TAINT_WARN, format); \ unlikely(__ret_warn_on); \ }) #endif diff --git a/kernel/panic.c b/kernel/panic.c index d1ece4c363b9..1d89f5423426 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -600,20 +600,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, } #ifdef WANT_WARN_ON_SLOWPATH -void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) -{ - struct warn_args args; - - args.fmt = fmt; - va_start(args.args, fmt); - __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, - &args); - va_end(args.args); -} -EXPORT_SYMBOL(warn_slowpath_fmt); - -void warn_slowpath_fmt_taint(const char *file, int line, - unsigned taint, const char *fmt, ...) +void warn_slowpath_fmt(const char *file, int line, unsigned taint, + const char *fmt, ...) { struct warn_args args; @@ -622,7 +610,7 @@ void warn_slowpath_fmt_taint(const char *file, int line, __warn(file, line, __builtin_return_address(0), taint, NULL, &args); va_end(args.args); } -EXPORT_SYMBOL(warn_slowpath_fmt_taint); +EXPORT_SYMBOL(warn_slowpath_fmt); void warn_slowpath_null(const char *file, int line) { -- cgit v1.2.3 From 89348fc31441f0270b040fbeb68b6d7d13504f36 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:47:55 -0700 Subject: bug: rename __WARN_printf_taint() to __WARN_printf() This just renames the helper to improve readability. Link: http://lkml.kernel.org/r/20190819234111.9019-3-keescook@chromium.org Signed-off-by: Kees Cook Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christophe Leroy Cc: Drew Davenport Cc: Feng Tang Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Petr Mladek Cc: "Steven Rostedt (VMware)" Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index c3a9c16a2b69..2d35bbf687d0 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -96,14 +96,14 @@ void warn_slowpath_fmt(const char *file, const int line, unsigned taint, extern void warn_slowpath_null(const char *file, const int line); #define WANT_WARN_ON_SLOWPATH #define __WARN() warn_slowpath_null(__FILE__, __LINE__) -#define __WARN_printf_taint(taint, arg...) \ +#define __WARN_printf(taint, arg...) \ warn_slowpath_fmt(__FILE__, __LINE__, taint, arg) #else extern __printf(1, 2) void __warn_printk(const char *fmt, ...); #define __WARN() do { \ printk(KERN_WARNING CUT_HERE); __WARN_TAINT(TAINT_WARN); \ } while (0) -#define __WARN_printf_taint(taint, arg...) \ +#define __WARN_printf(taint, arg...) \ do { __warn_printk(arg); __WARN_TAINT(taint); } while (0) #endif @@ -127,7 +127,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, #define WARN(condition, format...) ({ \ int __ret_warn_on = !!(condition); \ if (unlikely(__ret_warn_on)) \ - __WARN_printf_taint(TAINT_WARN, format); \ + __WARN_printf(TAINT_WARN, format); \ unlikely(__ret_warn_on); \ }) #endif @@ -135,7 +135,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, #define WARN_TAINT(condition, taint, format...) ({ \ int __ret_warn_on = !!(condition); \ if (unlikely(__ret_warn_on)) \ - __WARN_printf_taint(taint, format); \ + __WARN_printf(taint, format); \ unlikely(__ret_warn_on); \ }) -- cgit v1.2.3 From f2f84b05e02b7710a201f0017b3272ad7ef703d1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:47:58 -0700 Subject: bug: consolidate warn_slowpath_fmt() usage Instead of having a separate helper for no printk output, just consolidate the logic into warn_slowpath_fmt(). Link: http://lkml.kernel.org/r/20190819234111.9019-4-keescook@chromium.org Signed-off-by: Kees Cook Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christophe Leroy Cc: Drew Davenport Cc: Feng Tang Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Petr Mladek Cc: "Steven Rostedt (VMware)" Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 3 +-- kernel/panic.c | 14 +++++++------- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 2d35bbf687d0..598d7072602f 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -93,9 +93,8 @@ struct bug_entry { extern __printf(4, 5) void warn_slowpath_fmt(const char *file, const int line, unsigned taint, const char *fmt, ...); -extern void warn_slowpath_null(const char *file, const int line); #define WANT_WARN_ON_SLOWPATH -#define __WARN() warn_slowpath_null(__FILE__, __LINE__) +#define __WARN() __WARN_printf(TAINT_WARN, NULL) #define __WARN_printf(taint, arg...) \ warn_slowpath_fmt(__FILE__, __LINE__, taint, arg) #else diff --git a/kernel/panic.c b/kernel/panic.c index 1d89f5423426..79c153951b59 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -605,19 +605,19 @@ void warn_slowpath_fmt(const char *file, int line, unsigned taint, { struct warn_args args; + if (!fmt) { + pr_warn(CUT_HERE); + __warn(file, line, __builtin_return_address(0), taint, + NULL, NULL); + return; + } + args.fmt = fmt; va_start(args.args, fmt); __warn(file, line, __builtin_return_address(0), taint, NULL, &args); va_end(args.args); } EXPORT_SYMBOL(warn_slowpath_fmt); - -void warn_slowpath_null(const char *file, int line) -{ - pr_warn(CUT_HERE); - __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL); -} -EXPORT_SYMBOL(warn_slowpath_null); #else void __warn_printk(const char *fmt, ...) { -- cgit v1.2.3 From d4bce140b4e739bceb4e239d4842cf8f346c1e0f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:48:04 -0700 Subject: bug: clean up helper macros to remove __WARN_TAINT() In preparation for cleaning up "cut here" even more, this removes the __WARN_*TAINT() helpers, as they limit the ability to add new BUGFLAG_* flags to call sites. They are removed by expanding them into full __WARN_FLAGS() calls. Link: http://lkml.kernel.org/r/20190819234111.9019-6-keescook@chromium.org Signed-off-by: Kees Cook Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christophe Leroy Cc: Drew Davenport Cc: Feng Tang Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Petr Mladek Cc: "Steven Rostedt (VMware)" Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 598d7072602f..4b18e09094cf 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -62,13 +62,11 @@ struct bug_entry { #endif #ifdef __WARN_FLAGS -#define __WARN_TAINT(taint) __WARN_FLAGS(BUGFLAG_TAINT(taint)) -#define __WARN_ONCE_TAINT(taint) __WARN_FLAGS(BUGFLAG_ONCE|BUGFLAG_TAINT(taint)) - #define WARN_ON_ONCE(condition) ({ \ int __ret_warn_on = !!(condition); \ if (unlikely(__ret_warn_on)) \ - __WARN_ONCE_TAINT(TAINT_WARN); \ + __WARN_FLAGS(BUGFLAG_ONCE | \ + BUGFLAG_TAINT(TAINT_WARN)); \ unlikely(__ret_warn_on); \ }) #endif @@ -89,7 +87,7 @@ struct bug_entry { * * Use the versions with printk format strings to provide better diagnostics. */ -#ifndef __WARN_TAINT +#ifndef __WARN_FLAGS extern __printf(4, 5) void warn_slowpath_fmt(const char *file, const int line, unsigned taint, const char *fmt, ...); @@ -99,11 +97,14 @@ void warn_slowpath_fmt(const char *file, const int line, unsigned taint, warn_slowpath_fmt(__FILE__, __LINE__, taint, arg) #else extern __printf(1, 2) void __warn_printk(const char *fmt, ...); -#define __WARN() do { \ - printk(KERN_WARNING CUT_HERE); __WARN_TAINT(TAINT_WARN); \ -} while (0) -#define __WARN_printf(taint, arg...) \ - do { __warn_printk(arg); __WARN_TAINT(taint); } while (0) +#define __WARN() do { \ + printk(KERN_WARNING CUT_HERE); \ + __WARN_FLAGS(BUGFLAG_TAINT(TAINT_WARN)); \ + } while (0) +#define __WARN_printf(taint, arg...) do { \ + __warn_printk(arg); \ + __WARN_FLAGS(BUGFLAG_TAINT(taint)); \ + } while (0) #endif /* used internally by panic.c */ -- cgit v1.2.3 From 2da1ead4d5f7fa5f61e5805655de1e245d03a763 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:48:08 -0700 Subject: bug: consolidate __WARN_FLAGS usage Instead of having separate tests for __WARN_FLAGS, merge the two #ifdef blocks and replace the synonym WANT_WARN_ON_SLOWPATH macro. Link: http://lkml.kernel.org/r/20190819234111.9019-7-keescook@chromium.org Signed-off-by: Kees Cook Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christophe Leroy Cc: Drew Davenport Cc: Feng Tang Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Petr Mladek Cc: "Steven Rostedt (VMware)" Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 18 +++++++----------- kernel/panic.c | 2 +- 2 files changed, 8 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 4b18e09094cf..b4a2639130a0 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -61,16 +61,6 @@ struct bug_entry { #define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while (0) #endif -#ifdef __WARN_FLAGS -#define WARN_ON_ONCE(condition) ({ \ - int __ret_warn_on = !!(condition); \ - if (unlikely(__ret_warn_on)) \ - __WARN_FLAGS(BUGFLAG_ONCE | \ - BUGFLAG_TAINT(TAINT_WARN)); \ - unlikely(__ret_warn_on); \ -}) -#endif - /* * WARN(), WARN_ON(), WARN_ON_ONCE, and so on can be used to report * significant kernel issues that need prompt attention if they should ever @@ -91,7 +81,6 @@ struct bug_entry { extern __printf(4, 5) void warn_slowpath_fmt(const char *file, const int line, unsigned taint, const char *fmt, ...); -#define WANT_WARN_ON_SLOWPATH #define __WARN() __WARN_printf(TAINT_WARN, NULL) #define __WARN_printf(taint, arg...) \ warn_slowpath_fmt(__FILE__, __LINE__, taint, arg) @@ -105,6 +94,13 @@ extern __printf(1, 2) void __warn_printk(const char *fmt, ...); __warn_printk(arg); \ __WARN_FLAGS(BUGFLAG_TAINT(taint)); \ } while (0) +#define WARN_ON_ONCE(condition) ({ \ + int __ret_warn_on = !!(condition); \ + if (unlikely(__ret_warn_on)) \ + __WARN_FLAGS(BUGFLAG_ONCE | \ + BUGFLAG_TAINT(TAINT_WARN)); \ + unlikely(__ret_warn_on); \ +}) #endif /* used internally by panic.c */ diff --git a/kernel/panic.c b/kernel/panic.c index a643e5464296..47e8ebccc22b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -596,7 +596,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, add_taint(taint, LOCKDEP_STILL_OK); } -#ifdef WANT_WARN_ON_SLOWPATH +#ifndef __WARN_FLAGS void warn_slowpath_fmt(const char *file, int line, unsigned taint, const char *fmt, ...) { -- cgit v1.2.3 From a44f71a9ab99b509fec9d5a9f5c222debd89934f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:48:11 -0700 Subject: bug: move WARN_ON() "cut here" into exception handler The original clean up of "cut here" missed the WARN_ON() case (that does not have a printk message), which was fixed recently by adding an explicit printk of "cut here". This had the downside of adding a printk() to every WARN_ON() caller, which reduces the utility of using an instruction exception to streamline the resulting code. By making this a new BUGFLAG, all of these can be removed and "cut here" can be handled by the exception handler. This was very pronounced on PowerPC, but the effect can be seen on x86 as well. The resulting text size of a defconfig build shows some small savings from this patch: text data bss dec hex filename 19691167 5134320 1646664 26472151 193eed7 vmlinux.before 19676362 5134260 1663048 26473670 193f4c6 vmlinux.after This change also opens the door for creating something like BUG_MSG(), where a custom printk() before issuing BUG(), without confusing the "cut here" line. Link: http://lkml.kernel.org/r/201908200943.601DD59DCE@keescook Fixes: 6b15f678fb7d ("include/asm-generic/bug.h: fix "cut here" for WARN_ON for __WARN_TAINT architectures") Signed-off-by: Kees Cook Reported-by: Christophe Leroy Cc: Peter Zijlstra Cc: Christophe Leroy Cc: Drew Davenport Cc: Arnd Bergmann Cc: "Steven Rostedt (VMware)" Cc: Feng Tang Cc: Petr Mladek Cc: Mauro Carvalho Chehab Cc: Borislav Petkov Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 8 +++----- lib/bug.c | 11 +++++++++-- 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index b4a2639130a0..384b5c835ced 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -10,6 +10,7 @@ #define BUGFLAG_WARNING (1 << 0) #define BUGFLAG_ONCE (1 << 1) #define BUGFLAG_DONE (1 << 2) +#define BUGFLAG_NO_CUT_HERE (1 << 3) /* CUT_HERE already sent */ #define BUGFLAG_TAINT(taint) ((taint) << 8) #define BUG_GET_TAINT(bug) ((bug)->flags >> 8) #endif @@ -86,13 +87,10 @@ void warn_slowpath_fmt(const char *file, const int line, unsigned taint, warn_slowpath_fmt(__FILE__, __LINE__, taint, arg) #else extern __printf(1, 2) void __warn_printk(const char *fmt, ...); -#define __WARN() do { \ - printk(KERN_WARNING CUT_HERE); \ - __WARN_FLAGS(BUGFLAG_TAINT(TAINT_WARN)); \ - } while (0) +#define __WARN() __WARN_FLAGS(BUGFLAG_TAINT(TAINT_WARN)) #define __WARN_printf(taint, arg...) do { \ __warn_printk(arg); \ - __WARN_FLAGS(BUGFLAG_TAINT(taint)); \ + __WARN_FLAGS(BUGFLAG_NO_CUT_HERE | BUGFLAG_TAINT(taint));\ } while (0) #define WARN_ON_ONCE(condition) ({ \ int __ret_warn_on = !!(condition); \ diff --git a/lib/bug.c b/lib/bug.c index 1077366f496b..8c98af0bf585 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -181,6 +181,15 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) } } + /* + * BUG() and WARN_ON() families don't print a custom debug message + * before triggering the exception handler, so we must add the + * "cut here" line now. WARN() issues its own "cut here" before the + * extra debugging message it writes before triggering the handler. + */ + if ((bug->flags & BUGFLAG_NO_CUT_HERE) == 0) + printk(KERN_DEFAULT CUT_HERE); + if (warning) { /* this is a WARN_ON rather than BUG/BUG_ON */ __warn(file, line, (void *)bugaddr, BUG_GET_TAINT(bug), regs, @@ -188,8 +197,6 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) return BUG_TRAP_TYPE_WARN; } - printk(KERN_DEFAULT CUT_HERE); - if (file) pr_crit("kernel BUG at %s:%u!\n", file, line); else -- cgit v1.2.3 From 9c276cc65a58faf98be8e56962745ec99ab87636 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 25 Sep 2019 16:49:08 -0700 Subject: mm: introduce MADV_COLD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Introduce MADV_COLD and MADV_PAGEOUT", v7. - Background The Android terminology used for forking a new process and starting an app from scratch is a cold start, while resuming an existing app is a hot start. While we continually try to improve the performance of cold starts, hot starts will always be significantly less power hungry as well as faster so we are trying to make hot start more likely than cold start. To increase hot start, Android userspace manages the order that apps should be killed in a process called ActivityManagerService. ActivityManagerService tracks every Android app or service that the user could be interacting with at any time and translates that into a ranked list for lmkd(low memory killer daemon). They are likely to be killed by lmkd if the system has to reclaim memory. In that sense they are similar to entries in any other cache. Those apps are kept alive for opportunistic performance improvements but those performance improvements will vary based on the memory requirements of individual workloads. - Problem Naturally, cached apps were dominant consumers of memory on the system. However, they were not significant consumers of swap even though they are good candidate for swap. Under investigation, swapping out only begins once the low zone watermark is hit and kswapd wakes up, but the overall allocation rate in the system might trip lmkd thresholds and cause a cached process to be killed(we measured performance swapping out vs. zapping the memory by killing a process. Unsurprisingly, zapping is 10x times faster even though we use zram which is much faster than real storage) so kill from lmkd will often satisfy the high zone watermark, resulting in very few pages actually being moved to swap. - Approach The approach we chose was to use a new interface to allow userspace to proactively reclaim entire processes by leveraging platform information. This allowed us to bypass the inaccuracy of the kernel’s LRUs for pages that are known to be cold from userspace and to avoid races with lmkd by reclaiming apps as soon as they entered the cached state. Additionally, it could provide many chances for platform to use much information to optimize memory efficiency. To achieve the goal, the patchset introduce two new options for madvise. One is MADV_COLD which will deactivate activated pages and the other is MADV_PAGEOUT which will reclaim private pages instantly. These new options complement MADV_DONTNEED and MADV_FREE by adding non-destructive ways to gain some free memory space. MADV_PAGEOUT is similar to MADV_DONTNEED in a way that it hints the kernel that memory region is not currently needed and should be reclaimed immediately; MADV_COLD is similar to MADV_FREE in a way that it hints the kernel that memory region is not currently needed and should be reclaimed when memory pressure rises. This patch (of 5): When a process expects no accesses to a certain memory range, it could give a hint to kernel that the pages can be reclaimed when memory pressure happens but data should be preserved for future use. This could reduce workingset eviction so it ends up increasing performance. This patch introduces the new MADV_COLD hint to madvise(2) syscall. MADV_COLD can be used by a process to mark a memory range as not expected to be used in the near future. The hint can help kernel in deciding which pages to evict early during memory pressure. It works for every LRU pages like MADV_[DONTNEED|FREE]. IOW, It moves active file page -> inactive file LRU active anon page -> inacdtive anon LRU Unlike MADV_FREE, it doesn't move active anonymous pages to inactive file LRU's head because MADV_COLD is a little bit different symantic. MADV_FREE means it's okay to discard when the memory pressure because the content of the page is *garbage* so freeing such pages is almost zero overhead since we don't need to swap out and access afterward causes just minor fault. Thus, it would make sense to put those freeable pages in inactive file LRU to compete other used-once pages. It makes sense for implmentaion point of view, too because it's not swapbacked memory any longer until it would be re-dirtied. Even, it could give a bonus to make them be reclaimed on swapless system. However, MADV_COLD doesn't mean garbage so reclaiming them requires swap-out/in in the end so it's bigger cost. Since we have designed VM LRU aging based on cost-model, anonymous cold pages would be better to position inactive anon's LRU list, not file LRU. Furthermore, it would help to avoid unnecessary scanning if system doesn't have a swap device. Let's start simpler way without adding complexity at this moment. However, keep in mind, too that it's a caveat that workloads with a lot of pages cache are likely to ignore MADV_COLD on anonymous memory because we rarely age anonymous LRU lists. * man-page material MADV_COLD (since Linux x.x) Pages in the specified regions will be treated as less-recently-accessed compared to pages in the system with similar access frequencies. In contrast to MADV_FREE, the contents of the region are preserved regardless of subsequent writes to pages. MADV_COLD cannot be applied to locked pages, Huge TLB pages, or VM_PFNMAP pages. [akpm@linux-foundation.org: resolve conflicts with hmm.git] Link: http://lkml.kernel.org/r/20190726023435.214162-2-minchan@kernel.org Signed-off-by: Minchan Kim Reported-by: kbuild test robot Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: James E.J. Bottomley Cc: Richard Henderson Cc: Ralf Baechle Cc: Chris Zankel Cc: Johannes Weiner Cc: Daniel Colascione Cc: Dave Hansen Cc: Hillf Danton Cc: Joel Fernandes (Google) Cc: Kirill A. Shutemov Cc: Oleksandr Natalenko Cc: Shakeel Butt Cc: Sonny Rao Cc: Suren Baghdasaryan Cc: Tim Murray Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/uapi/asm/mman.h | 2 + arch/mips/include/uapi/asm/mman.h | 2 + arch/parisc/include/uapi/asm/mman.h | 2 + arch/xtensa/include/uapi/asm/mman.h | 2 + include/linux/swap.h | 1 + include/uapi/asm-generic/mman-common.h | 2 + mm/internal.h | 2 +- mm/madvise.c | 179 ++++++++++++++++++++++++++++++++- mm/oom_kill.c | 2 +- mm/swap.c | 42 ++++++++ 10 files changed, 232 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index ac23379b7a87..f3258fbf03d0 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -68,6 +68,8 @@ #define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ +#define MADV_COLD 20 /* deactivate these pages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index c2b40969eb1f..00ad09fc5eb1 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -95,6 +95,8 @@ #define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ +#define MADV_COLD 20 /* deactivate these pages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index c98162f494db..eb14e3a7b8f3 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -48,6 +48,8 @@ #define MADV_DONTFORK 10 /* don't inherit across fork */ #define MADV_DOFORK 11 /* do inherit across fork */ +#define MADV_COLD 20 /* deactivate these pages */ + #define MADV_MERGEABLE 65 /* KSM may merge identical pages */ #define MADV_UNMERGEABLE 66 /* KSM may not merge identical pages */ diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index ebbb48842190..f926b00ff11f 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -103,6 +103,8 @@ #define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ +#define MADV_COLD 20 /* deactivate these pages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/include/linux/swap.h b/include/linux/swap.h index de2c67a33b7e..0ce997edb8bb 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -340,6 +340,7 @@ extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_all(void); extern void rotate_reclaimable_page(struct page *page); extern void deactivate_file_page(struct page *page); +extern void deactivate_page(struct page *page); extern void mark_page_lazyfree(struct page *page); extern void swap_setup(void); diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 63b1f506ea67..23431faf0eb6 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -67,6 +67,8 @@ #define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ +#define MADV_COLD 20 /* deactivate these pages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/mm/internal.h b/mm/internal.h index e32390802fd3..0d5f720c75ab 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -39,7 +39,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); -static inline bool can_madv_dontneed_vma(struct vm_area_struct *vma) +static inline bool can_madv_lru_vma(struct vm_area_struct *vma) { return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); } diff --git a/mm/madvise.c b/mm/madvise.c index 1f8a6fdc6878..e1aee62967c3 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -42,6 +43,7 @@ static int madvise_need_mmap_write(int behavior) case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: + case MADV_COLD: case MADV_FREE: return 0; default: @@ -289,6 +291,176 @@ static long madvise_willneed(struct vm_area_struct *vma, return 0; } +static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct mmu_gather *tlb = walk->private; + struct mm_struct *mm = tlb->mm; + struct vm_area_struct *vma = walk->vma; + pte_t *orig_pte, *pte, ptent; + spinlock_t *ptl; + struct page *page; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_trans_huge(*pmd)) { + pmd_t orig_pmd; + unsigned long next = pmd_addr_end(addr, end); + + tlb_change_page_size(tlb, HPAGE_PMD_SIZE); + ptl = pmd_trans_huge_lock(pmd, vma); + if (!ptl) + return 0; + + orig_pmd = *pmd; + if (is_huge_zero_pmd(orig_pmd)) + goto huge_unlock; + + if (unlikely(!pmd_present(orig_pmd))) { + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(orig_pmd)); + goto huge_unlock; + } + + page = pmd_page(orig_pmd); + if (next - addr != HPAGE_PMD_SIZE) { + int err; + + if (page_mapcount(page) != 1) + goto huge_unlock; + + get_page(page); + spin_unlock(ptl); + lock_page(page); + err = split_huge_page(page); + unlock_page(page); + put_page(page); + if (!err) + goto regular_page; + return 0; + } + + if (pmd_young(orig_pmd)) { + pmdp_invalidate(vma, addr, pmd); + orig_pmd = pmd_mkold(orig_pmd); + + set_pmd_at(mm, addr, pmd, orig_pmd); + tlb_remove_pmd_tlb_entry(tlb, pmd, addr); + } + + test_and_clear_page_young(page); + deactivate_page(page); +huge_unlock: + spin_unlock(ptl); + return 0; + } + + if (pmd_trans_unstable(pmd)) + return 0; +regular_page: +#endif + tlb_change_page_size(tlb, PAGE_SIZE); + orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + flush_tlb_batched_pending(mm); + arch_enter_lazy_mmu_mode(); + for (; addr < end; pte++, addr += PAGE_SIZE) { + ptent = *pte; + + if (pte_none(ptent)) + continue; + + if (!pte_present(ptent)) + continue; + + page = vm_normal_page(vma, addr, ptent); + if (!page) + continue; + + /* + * Creating a THP page is expensive so split it only if we + * are sure it's worth. Split it if we are only owner. + */ + if (PageTransCompound(page)) { + if (page_mapcount(page) != 1) + break; + get_page(page); + if (!trylock_page(page)) { + put_page(page); + break; + } + pte_unmap_unlock(orig_pte, ptl); + if (split_huge_page(page)) { + unlock_page(page); + put_page(page); + pte_offset_map_lock(mm, pmd, addr, &ptl); + break; + } + unlock_page(page); + put_page(page); + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + pte--; + addr -= PAGE_SIZE; + continue; + } + + VM_BUG_ON_PAGE(PageTransCompound(page), page); + + if (pte_young(ptent)) { + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); + ptent = pte_mkold(ptent); + set_pte_at(mm, addr, pte, ptent); + tlb_remove_tlb_entry(tlb, pte, addr); + } + + /* + * We are deactivating a page for accelerating reclaiming. + * VM couldn't reclaim the page unless we clear PG_young. + * As a side effect, it makes confuse idle-page tracking + * because they will miss recent referenced history. + */ + test_and_clear_page_young(page); + deactivate_page(page); + } + + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(orig_pte, ptl); + cond_resched(); + + return 0; +} + +static const struct mm_walk_ops cold_walk_ops = { + .pmd_entry = madvise_cold_pte_range, +}; + +static void madvise_cold_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + tlb_start_vma(tlb, vma); + walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, NULL); + tlb_end_vma(tlb, vma); +} + +static long madvise_cold(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start_addr, unsigned long end_addr) +{ + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather tlb; + + *prev = vma; + if (!can_madv_lru_vma(vma)) + return -EINVAL; + + lru_add_drain(); + tlb_gather_mmu(&tlb, mm, start_addr, end_addr); + madvise_cold_page_range(&tlb, vma, start_addr, end_addr); + tlb_finish_mmu(&tlb, start_addr, end_addr); + + return 0; +} + static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -493,7 +665,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, int behavior) { *prev = vma; - if (!can_madv_dontneed_vma(vma)) + if (!can_madv_lru_vma(vma)) return -EINVAL; if (!userfaultfd_remove(vma, start, end)) { @@ -515,7 +687,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, */ return -ENOMEM; } - if (!can_madv_dontneed_vma(vma)) + if (!can_madv_lru_vma(vma)) return -EINVAL; if (end > vma->vm_end) { /* @@ -669,6 +841,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, return madvise_remove(vma, prev, start, end); case MADV_WILLNEED: return madvise_willneed(vma, prev, start, end); + case MADV_COLD: + return madvise_cold(vma, prev, start, end); case MADV_FREE: case MADV_DONTNEED: return madvise_dontneed_free(vma, prev, start, end, behavior); @@ -690,6 +864,7 @@ madvise_behavior_valid(int behavior) case MADV_WILLNEED: case MADV_DONTNEED: case MADV_FREE: + case MADV_COLD: #ifdef CONFIG_KSM case MADV_MERGEABLE: case MADV_UNMERGEABLE: diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c1d9496b4c43..71e3acea7817 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -523,7 +523,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm) set_bit(MMF_UNSTABLE, &mm->flags); for (vma = mm->mmap ; vma; vma = vma->vm_next) { - if (!can_madv_dontneed_vma(vma)) + if (!can_madv_lru_vma(vma)) continue; /* diff --git a/mm/swap.c b/mm/swap.c index 784dc1620620..38c3fa4308e2 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -47,6 +47,7 @@ int page_cluster; static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs); #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); @@ -538,6 +539,22 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, update_page_reclaim_stat(lruvec, file, 0); } +static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, + void *arg) +{ + if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { + int file = page_is_file_cache(page); + int lru = page_lru_base_type(page); + + del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); + ClearPageActive(page); + ClearPageReferenced(page); + add_page_to_lru_list(page, lruvec, lru); + + __count_vm_events(PGDEACTIVATE, hpage_nr_pages(page)); + update_page_reclaim_stat(lruvec, file, 0); + } +} static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, void *arg) @@ -590,6 +607,10 @@ void lru_add_drain_cpu(int cpu) if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); + pvec = &per_cpu(lru_deactivate_pvecs, cpu); + if (pagevec_count(pvec)) + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + pvec = &per_cpu(lru_lazyfree_pvecs, cpu); if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); @@ -623,6 +644,26 @@ void deactivate_file_page(struct page *page) } } +/* + * deactivate_page - deactivate a page + * @page: page to deactivate + * + * deactivate_page() moves @page to the inactive list if @page was on the active + * list and was not an unevictable page. This is done to accelerate the reclaim + * of @page. + */ +void deactivate_page(struct page *page) +{ + if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { + struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); + + get_page(page); + if (!pagevec_add(pvec, page) || PageCompound(page)) + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + put_cpu_var(lru_deactivate_pvecs); + } +} + /** * mark_page_lazyfree - make an anon page lazyfree * @page: page to deactivate @@ -687,6 +728,7 @@ void lru_add_drain_all(void) if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); -- cgit v1.2.3 From 1a4e58cce84ee88129d5d49c064bd2852b481357 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 25 Sep 2019 16:49:15 -0700 Subject: mm: introduce MADV_PAGEOUT When a process expects no accesses to a certain memory range for a long time, it could hint kernel that the pages can be reclaimed instantly but data should be preserved for future use. This could reduce workingset eviction so it ends up increasing performance. This patch introduces the new MADV_PAGEOUT hint to madvise(2) syscall. MADV_PAGEOUT can be used by a process to mark a memory range as not expected to be used for a long time so that kernel reclaims *any LRU* pages instantly. The hint can help kernel in deciding which pages to evict proactively. A note: It doesn't apply SWAP_CLUSTER_MAX LRU page isolation limit intentionally because it's automatically bounded by PMD size. If PMD size(e.g., 256) makes some trouble, we could fix it later by limit it to SWAP_CLUSTER_MAX[1]. - man-page material MADV_PAGEOUT (since Linux x.x) Do not expect access in the near future so pages in the specified regions could be reclaimed instantly regardless of memory pressure. Thus, access in the range after successful operation could cause major page fault but never lose the up-to-date contents unlike MADV_DONTNEED. Pages belonging to a shared mapping are only processed if a write access is allowed for the calling process. MADV_PAGEOUT cannot be applied to locked pages, Huge TLB pages, or VM_PFNMAP pages. [1] https://lore.kernel.org/lkml/20190710194719.GS29695@dhcp22.suse.cz/ [minchan@kernel.org: clear PG_active on MADV_PAGEOUT] Link: http://lkml.kernel.org/r/20190802200643.GA181880@google.com [akpm@linux-foundation.org: resolve conflicts with hmm.git] Link: http://lkml.kernel.org/r/20190726023435.214162-5-minchan@kernel.org Signed-off-by: Minchan Kim Reported-by: kbuild test robot Acked-by: Michal Hocko Cc: James E.J. Bottomley Cc: Richard Henderson Cc: Ralf Baechle Cc: Chris Zankel Cc: Daniel Colascione Cc: Dave Hansen Cc: Hillf Danton Cc: Joel Fernandes (Google) Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Oleksandr Natalenko Cc: Shakeel Butt Cc: Sonny Rao Cc: Suren Baghdasaryan Cc: Tim Murray Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/uapi/asm/mman.h | 1 + arch/mips/include/uapi/asm/mman.h | 1 + arch/parisc/include/uapi/asm/mman.h | 1 + arch/xtensa/include/uapi/asm/mman.h | 1 + include/linux/swap.h | 1 + include/uapi/asm-generic/mman-common.h | 1 + mm/madvise.c | 189 +++++++++++++++++++++++++++++++++ mm/vmscan.c | 56 ++++++++++ 8 files changed, 251 insertions(+) (limited to 'include') diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index f3258fbf03d0..a18ec7f63888 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -69,6 +69,7 @@ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ #define MADV_COLD 20 /* deactivate these pages */ +#define MADV_PAGEOUT 21 /* reclaim these pages */ /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 00ad09fc5eb1..57dc2ac4f8bd 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -96,6 +96,7 @@ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ #define MADV_COLD 20 /* deactivate these pages */ +#define MADV_PAGEOUT 21 /* reclaim these pages */ /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index eb14e3a7b8f3..6fd8871e4081 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -49,6 +49,7 @@ #define MADV_DOFORK 11 /* do inherit across fork */ #define MADV_COLD 20 /* deactivate these pages */ +#define MADV_PAGEOUT 21 /* reclaim these pages */ #define MADV_MERGEABLE 65 /* KSM may merge identical pages */ #define MADV_UNMERGEABLE 66 /* KSM may not merge identical pages */ diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index f926b00ff11f..e5e643752947 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -104,6 +104,7 @@ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ #define MADV_COLD 20 /* deactivate these pages */ +#define MADV_PAGEOUT 21 /* reclaim these pages */ /* compatibility flags */ #define MAP_FILE 0 diff --git a/include/linux/swap.h b/include/linux/swap.h index 0ce997edb8bb..063c0c1e112b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -365,6 +365,7 @@ extern int vm_swappiness; extern int remove_mapping(struct address_space *mapping, struct page *page); extern unsigned long vm_total_pages; +extern unsigned long reclaim_pages(struct list_head *page_list); #ifdef CONFIG_NUMA extern int node_reclaim_mode; extern int sysctl_min_unmapped_ratio; diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 23431faf0eb6..c160a5354eb6 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -68,6 +68,7 @@ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ #define MADV_COLD 20 /* deactivate these pages */ +#define MADV_PAGEOUT 21 /* reclaim these pages */ /* compatibility flags */ #define MAP_FILE 0 diff --git a/mm/madvise.c b/mm/madvise.c index e1aee62967c3..54c5639774b6 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -44,6 +44,7 @@ static int madvise_need_mmap_write(int behavior) case MADV_WILLNEED: case MADV_DONTNEED: case MADV_COLD: + case MADV_PAGEOUT: case MADV_FREE: return 0; default: @@ -461,6 +462,191 @@ static long madvise_cold(struct vm_area_struct *vma, return 0; } +static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct mmu_gather *tlb = walk->private; + struct mm_struct *mm = tlb->mm; + struct vm_area_struct *vma = walk->vma; + pte_t *orig_pte, *pte, ptent; + spinlock_t *ptl; + LIST_HEAD(page_list); + struct page *page; + + if (fatal_signal_pending(current)) + return -EINTR; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_trans_huge(*pmd)) { + pmd_t orig_pmd; + unsigned long next = pmd_addr_end(addr, end); + + tlb_change_page_size(tlb, HPAGE_PMD_SIZE); + ptl = pmd_trans_huge_lock(pmd, vma); + if (!ptl) + return 0; + + orig_pmd = *pmd; + if (is_huge_zero_pmd(orig_pmd)) + goto huge_unlock; + + if (unlikely(!pmd_present(orig_pmd))) { + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(orig_pmd)); + goto huge_unlock; + } + + page = pmd_page(orig_pmd); + if (next - addr != HPAGE_PMD_SIZE) { + int err; + + if (page_mapcount(page) != 1) + goto huge_unlock; + get_page(page); + spin_unlock(ptl); + lock_page(page); + err = split_huge_page(page); + unlock_page(page); + put_page(page); + if (!err) + goto regular_page; + return 0; + } + + if (pmd_young(orig_pmd)) { + pmdp_invalidate(vma, addr, pmd); + orig_pmd = pmd_mkold(orig_pmd); + + set_pmd_at(mm, addr, pmd, orig_pmd); + tlb_remove_tlb_entry(tlb, pmd, addr); + } + + ClearPageReferenced(page); + test_and_clear_page_young(page); + + if (!isolate_lru_page(page)) + list_add(&page->lru, &page_list); +huge_unlock: + spin_unlock(ptl); + reclaim_pages(&page_list); + return 0; + } + + if (pmd_trans_unstable(pmd)) + return 0; +regular_page: +#endif + tlb_change_page_size(tlb, PAGE_SIZE); + orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + flush_tlb_batched_pending(mm); + arch_enter_lazy_mmu_mode(); + for (; addr < end; pte++, addr += PAGE_SIZE) { + ptent = *pte; + if (!pte_present(ptent)) + continue; + + page = vm_normal_page(vma, addr, ptent); + if (!page) + continue; + + /* + * creating a THP page is expensive so split it only if we + * are sure it's worth. Split it if we are only owner. + */ + if (PageTransCompound(page)) { + if (page_mapcount(page) != 1) + break; + get_page(page); + if (!trylock_page(page)) { + put_page(page); + break; + } + pte_unmap_unlock(orig_pte, ptl); + if (split_huge_page(page)) { + unlock_page(page); + put_page(page); + pte_offset_map_lock(mm, pmd, addr, &ptl); + break; + } + unlock_page(page); + put_page(page); + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + pte--; + addr -= PAGE_SIZE; + continue; + } + + VM_BUG_ON_PAGE(PageTransCompound(page), page); + + if (pte_young(ptent)) { + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); + ptent = pte_mkold(ptent); + set_pte_at(mm, addr, pte, ptent); + tlb_remove_tlb_entry(tlb, pte, addr); + } + ClearPageReferenced(page); + test_and_clear_page_young(page); + + if (!isolate_lru_page(page)) + list_add(&page->lru, &page_list); + } + + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(orig_pte, ptl); + reclaim_pages(&page_list); + cond_resched(); + + return 0; +} + +static void madvise_pageout_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + tlb_start_vma(tlb, vma); + walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, NULL); + tlb_end_vma(tlb, vma); +} + +static inline bool can_do_pageout(struct vm_area_struct *vma) +{ + if (vma_is_anonymous(vma)) + return true; + if (!vma->vm_file) + return false; + /* + * paging out pagecache only for non-anonymous mappings that correspond + * to the files the calling process could (if tried) open for writing; + * otherwise we'd be including shared non-exclusive mappings, which + * opens a side channel. + */ + return inode_owner_or_capable(file_inode(vma->vm_file)) || + inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0; +} + +static long madvise_pageout(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start_addr, unsigned long end_addr) +{ + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather tlb; + + *prev = vma; + if (!can_madv_lru_vma(vma)) + return -EINVAL; + + if (!can_do_pageout(vma)) + return 0; + + lru_add_drain(); + tlb_gather_mmu(&tlb, mm, start_addr, end_addr); + madvise_pageout_page_range(&tlb, vma, start_addr, end_addr); + tlb_finish_mmu(&tlb, start_addr, end_addr); + + return 0; +} + static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -843,6 +1029,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, return madvise_willneed(vma, prev, start, end); case MADV_COLD: return madvise_cold(vma, prev, start, end); + case MADV_PAGEOUT: + return madvise_pageout(vma, prev, start, end); case MADV_FREE: case MADV_DONTNEED: return madvise_dontneed_free(vma, prev, start, end, behavior); @@ -865,6 +1053,7 @@ madvise_behavior_valid(int behavior) case MADV_DONTNEED: case MADV_FREE: case MADV_COLD: + case MADV_PAGEOUT: #ifdef CONFIG_KSM case MADV_MERGEABLE: case MADV_UNMERGEABLE: diff --git a/mm/vmscan.c b/mm/vmscan.c index d8bbaf068c35..e5d52d6a24af 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2145,6 +2145,62 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_deactivate, nr_rotated, sc->priority, file); } +unsigned long reclaim_pages(struct list_head *page_list) +{ + int nid = -1; + unsigned long nr_reclaimed = 0; + LIST_HEAD(node_page_list); + struct reclaim_stat dummy_stat; + struct page *page; + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .priority = DEF_PRIORITY, + .may_writepage = 1, + .may_unmap = 1, + .may_swap = 1, + }; + + while (!list_empty(page_list)) { + page = lru_to_page(page_list); + if (nid == -1) { + nid = page_to_nid(page); + INIT_LIST_HEAD(&node_page_list); + } + + if (nid == page_to_nid(page)) { + ClearPageActive(page); + list_move(&page->lru, &node_page_list); + continue; + } + + nr_reclaimed += shrink_page_list(&node_page_list, + NODE_DATA(nid), + &sc, 0, + &dummy_stat, false); + while (!list_empty(&node_page_list)) { + page = lru_to_page(&node_page_list); + list_del(&page->lru); + putback_lru_page(page); + } + + nid = -1; + } + + if (!list_empty(&node_page_list)) { + nr_reclaimed += shrink_page_list(&node_page_list, + NODE_DATA(nid), + &sc, 0, + &dummy_stat, false); + while (!list_empty(&node_page_list)) { + page = lru_to_page(&node_page_list); + list_del(&page->lru); + putback_lru_page(page); + } + } + + return nr_reclaimed; +} + /* * The inactive anon list should be small enough that the VM never has * to do too much work. -- cgit v1.2.3 From b4ed71f557e458257e0f71b11969954acb389240 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 25 Sep 2019 16:49:46 -0700 Subject: mm: treewide: clarify pgtable_page_{ctor,dtor}() naming The naming of pgtable_page_{ctor,dtor}() seems to have confused a few people, and until recently arm64 used these erroneously/pointlessly for other levels of page table. To make it incredibly clear that these only apply to the PTE level, and to align with the naming of pgtable_pmd_page_{ctor,dtor}(), let's rename them to pgtable_pte_page_{ctor,dtor}(). These changes were generated with the following shell script: ---- git grep -lw 'pgtable_page_.tor' | while read FILE; do sed -i '{s/pgtable_page_ctor/pgtable_pte_page_ctor/}' $FILE; sed -i '{s/pgtable_page_dtor/pgtable_pte_page_dtor/}' $FILE; done ---- ... with the documentation re-flowed to remain under 80 columns, and whitespace fixed up in macros to keep backslashes aligned. There should be no functional change as a result of this patch. Link: http://lkml.kernel.org/r/20190722141133.3116-1-mark.rutland@arm.com Signed-off-by: Mark Rutland Reviewed-by: Mike Rapoport Acked-by: Geert Uytterhoeven [m68k] Cc: Anshuman Khandual Cc: Matthew Wilcox Cc: Michal Hocko Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/split_page_table_lock.rst | 10 +++++----- arch/arc/include/asm/pgalloc.h | 4 ++-- arch/arm/include/asm/tlb.h | 2 +- arch/arm/mm/mmu.c | 2 +- arch/arm64/include/asm/tlb.h | 2 +- arch/arm64/mm/mmu.c | 2 +- arch/csky/include/asm/pgalloc.h | 2 +- arch/hexagon/include/asm/pgalloc.h | 2 +- arch/m68k/include/asm/mcf_pgalloc.h | 6 +++--- arch/m68k/include/asm/motorola_pgalloc.h | 6 +++--- arch/m68k/include/asm/sun3_pgalloc.h | 2 +- arch/mips/include/asm/pgalloc.h | 2 +- arch/nios2/include/asm/pgalloc.h | 2 +- arch/openrisc/include/asm/pgalloc.h | 6 +++--- arch/powerpc/mm/pgtable-frag.c | 6 +++--- arch/riscv/include/asm/pgalloc.h | 2 +- arch/s390/mm/pgalloc.c | 6 +++--- arch/sh/include/asm/pgalloc.h | 2 +- arch/sparc/mm/init_64.c | 4 ++-- arch/sparc/mm/srmmu.c | 4 ++-- arch/um/include/asm/pgalloc.h | 2 +- arch/unicore32/include/asm/tlb.h | 2 +- arch/x86/mm/pgtable.c | 2 +- arch/xtensa/include/asm/pgalloc.h | 4 ++-- include/asm-generic/pgalloc.h | 8 ++++---- include/linux/mm.h | 4 ++-- 26 files changed, 48 insertions(+), 48 deletions(-) (limited to 'include') diff --git a/Documentation/vm/split_page_table_lock.rst b/Documentation/vm/split_page_table_lock.rst index 889b00be469f..ff51f4a5494d 100644 --- a/Documentation/vm/split_page_table_lock.rst +++ b/Documentation/vm/split_page_table_lock.rst @@ -54,9 +54,9 @@ Hugetlb-specific helpers: Support of split page table lock by an architecture =================================================== -There's no need in special enabling of PTE split page table lock: -everything required is done by pgtable_page_ctor() and pgtable_page_dtor(), -which must be called on PTE table allocation / freeing. +There's no need in special enabling of PTE split page table lock: everything +required is done by pgtable_pte_page_ctor() and pgtable_pte_page_dtor(), which +must be called on PTE table allocation / freeing. Make sure the architecture doesn't use slab allocator for page table allocation: slab uses page->slab_cache for its pages. @@ -74,7 +74,7 @@ paths: i.e X86_PAE preallocate few PMDs on pgd_alloc(). With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK. -NOTE: pgtable_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must +NOTE: pgtable_pte_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must be handled properly. page->ptl @@ -94,7 +94,7 @@ trick: split lock with enabled DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC, but costs one more cache line for indirect access; -The spinlock_t allocated in pgtable_page_ctor() for PTE table and in +The spinlock_t allocated in pgtable_pte_page_ctor() for PTE table and in pgtable_pmd_page_ctor() for PMD table. Please, never access page->ptl directly -- use appropriate helper. diff --git a/arch/arc/include/asm/pgalloc.h b/arch/arc/include/asm/pgalloc.h index 4751f2251cd9..b747f2ec2928 100644 --- a/arch/arc/include/asm/pgalloc.h +++ b/arch/arc/include/asm/pgalloc.h @@ -108,7 +108,7 @@ pte_alloc_one(struct mm_struct *mm) return 0; memzero((void *)pte_pg, PTRS_PER_PTE * sizeof(pte_t)); page = virt_to_page(pte_pg); - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return 0; } @@ -123,7 +123,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) static inline void pte_free(struct mm_struct *mm, pgtable_t ptep) { - pgtable_page_dtor(virt_to_page(ptep)); + pgtable_pte_page_dtor(virt_to_page(ptep)); free_pages((unsigned long)ptep, __get_order_pte()); } diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index b75ea15b85c0..669474add486 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -44,7 +44,7 @@ static inline void __tlb_remove_table(void *_table) static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr) { - pgtable_page_dtor(pte); + pgtable_pte_page_dtor(pte); #ifndef CONFIG_ARM_LPAE /* diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 25da9b2d9610..48c2888297dd 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -731,7 +731,7 @@ static void *__init late_alloc(unsigned long sz) { void *ptr = (void *)__get_free_pages(GFP_PGTABLE_KERNEL, get_order(sz)); - if (!ptr || !pgtable_page_ctor(virt_to_page(ptr))) + if (!ptr || !pgtable_pte_page_ctor(virt_to_page(ptr))) BUG(); return ptr; } diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index a95d1fcb7e21..b76df828e6b7 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -44,7 +44,7 @@ static inline void tlb_flush(struct mmu_gather *tlb) static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr) { - pgtable_page_dtor(pte); + pgtable_pte_page_dtor(pte); tlb_remove_table(tlb, pte); } diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 53dc6f24cfb7..60c929f3683b 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -384,7 +384,7 @@ static phys_addr_t pgd_pgtable_alloc(int shift) * folded, and if so pgtable_pmd_page_ctor() becomes nop. */ if (shift == PAGE_SHIFT) - BUG_ON(!pgtable_page_ctor(phys_to_page(pa))); + BUG_ON(!pgtable_pte_page_ctor(phys_to_page(pa))); else if (shift == PMD_SHIFT) BUG_ON(!pgtable_pmd_page_ctor(phys_to_page(pa))); diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h index d089113fe41f..c7c1ed27e348 100644 --- a/arch/csky/include/asm/pgalloc.h +++ b/arch/csky/include/asm/pgalloc.h @@ -71,7 +71,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) #define __pte_free_tlb(tlb, pte, address) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page(tlb, pte); \ } while (0) diff --git a/arch/hexagon/include/asm/pgalloc.h b/arch/hexagon/include/asm/pgalloc.h index 5a6e79e7926d..cc9be514a676 100644 --- a/arch/hexagon/include/asm/pgalloc.h +++ b/arch/hexagon/include/asm/pgalloc.h @@ -94,7 +94,7 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, #define __pte_free_tlb(tlb, pte, addr) \ do { \ - pgtable_page_dtor((pte)); \ + pgtable_pte_page_dtor((pte)); \ tlb_remove_page((tlb), (pte)); \ } while (0) diff --git a/arch/m68k/include/asm/mcf_pgalloc.h b/arch/m68k/include/asm/mcf_pgalloc.h index 4399d712f6db..b34d44d666a4 100644 --- a/arch/m68k/include/asm/mcf_pgalloc.h +++ b/arch/m68k/include/asm/mcf_pgalloc.h @@ -41,7 +41,7 @@ extern inline pmd_t *pmd_alloc_kernel(pgd_t *pgd, unsigned long address) static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t page, unsigned long address) { - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); } @@ -54,7 +54,7 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm) if (!page) return NULL; - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return NULL; } @@ -73,7 +73,7 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm) static inline void pte_free(struct mm_struct *mm, struct page *page) { - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); } diff --git a/arch/m68k/include/asm/motorola_pgalloc.h b/arch/m68k/include/asm/motorola_pgalloc.h index d04d9ba9b976..acab315c851f 100644 --- a/arch/m68k/include/asm/motorola_pgalloc.h +++ b/arch/m68k/include/asm/motorola_pgalloc.h @@ -36,7 +36,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) page = alloc_pages(GFP_KERNEL|__GFP_ZERO, 0); if(!page) return NULL; - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return NULL; } @@ -51,7 +51,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) static inline void pte_free(struct mm_struct *mm, pgtable_t page) { - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); cache_page(kmap(page)); kunmap(page); __free_page(page); @@ -60,7 +60,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t page) static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t page, unsigned long address) { - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); cache_page(kmap(page)); kunmap(page); __free_page(page); diff --git a/arch/m68k/include/asm/sun3_pgalloc.h b/arch/m68k/include/asm/sun3_pgalloc.h index 1a8ddbd0d23c..856121122b91 100644 --- a/arch/m68k/include/asm/sun3_pgalloc.h +++ b/arch/m68k/include/asm/sun3_pgalloc.h @@ -21,7 +21,7 @@ extern const char bad_pmd_string[]; #define __pte_free_tlb(tlb,pte,addr) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), pte); \ } while (0) diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index aa73cb187a07..166842337eb2 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -54,7 +54,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) #define __pte_free_tlb(tlb,pte,address) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), pte); \ } while (0) diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h index 750d18d5980b..0b146d773c85 100644 --- a/arch/nios2/include/asm/pgalloc.h +++ b/arch/nios2/include/asm/pgalloc.h @@ -41,7 +41,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) #define __pte_free_tlb(tlb, pte, addr) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), (pte)); \ } while (0) diff --git a/arch/openrisc/include/asm/pgalloc.h b/arch/openrisc/include/asm/pgalloc.h index 787c1b9d2f6d..da12a4c38c4b 100644 --- a/arch/openrisc/include/asm/pgalloc.h +++ b/arch/openrisc/include/asm/pgalloc.h @@ -75,7 +75,7 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm) if (!pte) return NULL; clear_page(page_address(pte)); - if (!pgtable_page_ctor(pte)) { + if (!pgtable_pte_page_ctor(pte)) { __free_page(pte); return NULL; } @@ -89,13 +89,13 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) static inline void pte_free(struct mm_struct *mm, struct page *pte) { - pgtable_page_dtor(pte); + pgtable_pte_page_dtor(pte); __free_page(pte); } #define __pte_free_tlb(tlb, pte, addr) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), (pte)); \ } while (0) diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c index a7b05214760c..ee4bd6d38602 100644 --- a/arch/powerpc/mm/pgtable-frag.c +++ b/arch/powerpc/mm/pgtable-frag.c @@ -25,7 +25,7 @@ void pte_frag_destroy(void *pte_frag) count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT; /* We allow PTE_FRAG_NR fragments from a PTE page */ if (atomic_sub_and_test(PTE_FRAG_NR - count, &page->pt_frag_refcount)) { - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); } } @@ -61,7 +61,7 @@ static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel) page = alloc_page(PGALLOC_GFP | __GFP_ACCOUNT); if (!page) return NULL; - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return NULL; } @@ -113,7 +113,7 @@ void pte_fragment_free(unsigned long *table, int kernel) BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); if (atomic_dec_and_test(&page->pt_frag_refcount)) { if (!kernel) - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); } } diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h index f66a00d8cb19..d59ea92285ec 100644 --- a/arch/riscv/include/asm/pgalloc.h +++ b/arch/riscv/include/asm/pgalloc.h @@ -78,7 +78,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) #define __pte_free_tlb(tlb, pte, buf) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), pte); \ } while (0) diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 54fcdf66ae96..3dd253f81a77 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -210,7 +210,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) page = alloc_page(GFP_KERNEL); if (!page) return NULL; - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return NULL; } @@ -256,7 +256,7 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) atomic_xor_bits(&page->_refcount, 3U << 24); } - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); } @@ -308,7 +308,7 @@ void __tlb_remove_table(void *_table) case 3: /* 4K page table with pgstes */ if (mask & 3) atomic_xor_bits(&page->_refcount, 3 << 24); - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); break; } diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h index 8c6341a4d807..22d968bfe9bb 100644 --- a/arch/sh/include/asm/pgalloc.h +++ b/arch/sh/include/asm/pgalloc.h @@ -29,7 +29,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, #define __pte_free_tlb(tlb,pte,addr) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), (pte)); \ } while (0) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 4b099dd7a767..e6d91819da92 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2903,7 +2903,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm) struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) return NULL; - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { free_unref_page(page); return NULL; } @@ -2919,7 +2919,7 @@ static void __pte_free(pgtable_t pte) { struct page *page = virt_to_page(pte); - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); } diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index aaebbc00d262..cc3ad64479ac 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -378,7 +378,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm) if ((pte = (unsigned long)pte_alloc_one_kernel(mm)) == 0) return NULL; page = pfn_to_page(__nocache_pa(pte) >> PAGE_SHIFT); - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return NULL; } @@ -389,7 +389,7 @@ void pte_free(struct mm_struct *mm, pgtable_t pte) { unsigned long p; - pgtable_page_dtor(pte); + pgtable_pte_page_dtor(pte); p = (unsigned long)page_address(pte); /* Cached address (for test) */ if (p == 0) BUG(); diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h index 446e0c0f4018..881e76da1938 100644 --- a/arch/um/include/asm/pgalloc.h +++ b/arch/um/include/asm/pgalloc.h @@ -29,7 +29,7 @@ extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); #define __pte_free_tlb(tlb,pte, address) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb),(pte)); \ } while (0) diff --git a/arch/unicore32/include/asm/tlb.h b/arch/unicore32/include/asm/tlb.h index 10d2356bfddd..4663d8cc80ef 100644 --- a/arch/unicore32/include/asm/tlb.h +++ b/arch/unicore32/include/asm/tlb.h @@ -15,7 +15,7 @@ #define __pte_free_tlb(tlb, pte, addr) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), (pte)); \ } while (0) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 463940faf52f..3e4b9035bb9a 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -45,7 +45,7 @@ early_param("userpte", setup_userpte); void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { - pgtable_page_dtor(pte); + pgtable_pte_page_dtor(pte); paravirt_release_pte(page_to_pfn(pte)); paravirt_tlb_remove_table(tlb, pte); } diff --git a/arch/xtensa/include/asm/pgalloc.h b/arch/xtensa/include/asm/pgalloc.h index dd744aa450fa..1d38f0e755ba 100644 --- a/arch/xtensa/include/asm/pgalloc.h +++ b/arch/xtensa/include/asm/pgalloc.h @@ -55,7 +55,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) if (!pte) return NULL; page = virt_to_page(pte); - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return NULL; } @@ -69,7 +69,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) static inline void pte_free(struct mm_struct *mm, pgtable_t pte) { - pgtable_page_dtor(pte); + pgtable_pte_page_dtor(pte); __free_page(pte); } #define pmd_pgtable(pmd) pmd_page(pmd) diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 6f8cc06ee44e..73f7421413cb 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -49,7 +49,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) * @mm: the mm_struct of the current context * @gfp: GFP flags to use for the allocation * - * Allocates a page and runs the pgtable_page_ctor(). + * Allocates a page and runs the pgtable_pte_page_ctor(). * * This function is intended for architectures that need * anything beyond simple page allocation or must have custom GFP flags. @@ -63,7 +63,7 @@ static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp) pte = alloc_page(gfp); if (!pte) return NULL; - if (!pgtable_page_ctor(pte)) { + if (!pgtable_pte_page_ctor(pte)) { __free_page(pte); return NULL; } @@ -76,7 +76,7 @@ static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp) * pte_alloc_one - allocate a page for PTE-level user page table * @mm: the mm_struct of the current context * - * Allocates a page and runs the pgtable_page_ctor(). + * Allocates a page and runs the pgtable_pte_page_ctor(). * * Return: `struct page` initialized as page table or %NULL on error */ @@ -98,7 +98,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) */ static inline void pte_free(struct mm_struct *mm, struct page *pte_page) { - pgtable_page_dtor(pte_page); + pgtable_pte_page_dtor(pte_page); __free_page(pte_page); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 294a67b94147..cc292273e6ba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1949,7 +1949,7 @@ static inline void pgtable_init(void) pgtable_cache_init(); } -static inline bool pgtable_page_ctor(struct page *page) +static inline bool pgtable_pte_page_ctor(struct page *page) { if (!ptlock_init(page)) return false; @@ -1958,7 +1958,7 @@ static inline bool pgtable_page_ctor(struct page *page) return true; } -static inline void pgtable_page_dtor(struct page *page) +static inline void pgtable_pte_page_dtor(struct page *page) { ptlock_free(page); __ClearPageTable(page); -- cgit v1.2.3