From afaef01c001537fa97a25092d7f54d764dc7d8c1 Mon Sep 17 00:00:00 2001 From: Alexander Popov Date: Fri, 17 Aug 2018 01:16:58 +0300 Subject: x86/entry: Add STACKLEAK erasing the kernel stack at the end of syscalls The STACKLEAK feature (initially developed by PaX Team) has the following benefits: 1. Reduces the information that can be revealed through kernel stack leak bugs. The idea of erasing the thread stack at the end of syscalls is similar to CONFIG_PAGE_POISONING and memzero_explicit() in kernel crypto, which all comply with FDP_RIP.2 (Full Residual Information Protection) of the Common Criteria standard. 2. Blocks some uninitialized stack variable attacks (e.g. CVE-2017-17712, CVE-2010-2963). That kind of bugs should be killed by improving C compilers in future, which might take a long time. This commit introduces the code filling the used part of the kernel stack with a poison value before returning to userspace. Full STACKLEAK feature also contains the gcc plugin which comes in a separate commit. The STACKLEAK feature is ported from grsecurity/PaX. More information at: https://grsecurity.net/ https://pax.grsecurity.net/ This code is modified from Brad Spengler/PaX Team's code in the last public patch of grsecurity/PaX based on our understanding of the code. Changes or omissions from the original code are ours and don't reflect the original grsecurity/PaX code. Performance impact: Hardware: Intel Core i7-4770, 16 GB RAM Test #1: building the Linux kernel on a single core 0.91% slowdown Test #2: hackbench -s 4096 -l 2000 -g 15 -f 25 -P 4.2% slowdown So the STACKLEAK description in Kconfig includes: "The tradeoff is the performance impact: on a single CPU system kernel compilation sees a 1% slowdown, other systems and workloads may vary and you are advised to test this feature on your expected workload before deploying it". Signed-off-by: Alexander Popov Acked-by: Thomas Gleixner Reviewed-by: Dave Hansen Acked-by: Ingo Molnar Signed-off-by: Kees Cook --- kernel/Makefile | 4 ++++ kernel/fork.c | 3 +++ kernel/stackleak.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+) create mode 100644 kernel/stackleak.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 7a63d567fdb5..7343b3a9bff0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -117,6 +117,10 @@ obj-$(CONFIG_HAS_IOMEM) += iomem.o obj-$(CONFIG_ZONE_DEVICE) += memremap.o obj-$(CONFIG_RSEQ) += rseq.o +obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o +KASAN_SANITIZE_stackleak.o := n +KCOV_INSTRUMENT_stackleak.o := n + $(obj)/configs.o: $(obj)/config_data.h targets += config_data.gz diff --git a/kernel/fork.c b/kernel/fork.c index d896e9ca38b0..47911e49c2b1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -91,6 +91,7 @@ #include #include #include +#include #include #include @@ -1880,6 +1881,8 @@ static __latent_entropy struct task_struct *copy_process( if (retval) goto bad_fork_cleanup_io; + stackleak_task_init(p); + if (pid != &init_struct_pid) { pid = alloc_pid(p->nsproxy->pid_ns_for_children); if (IS_ERR(pid)) { diff --git a/kernel/stackleak.c b/kernel/stackleak.c new file mode 100644 index 000000000000..deba0d8992f9 --- /dev/null +++ b/kernel/stackleak.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This code fills the used part of the kernel stack with a poison value + * before returning to userspace. It's part of the STACKLEAK feature + * ported from grsecurity/PaX. + * + * Author: Alexander Popov + * + * STACKLEAK reduces the information which kernel stack leak bugs can + * reveal and blocks some uninitialized stack variable attacks. + */ + +#include + +asmlinkage void stackleak_erase(void) +{ + /* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */ + unsigned long kstack_ptr = current->lowest_stack; + unsigned long boundary = (unsigned long)end_of_stack(current); + unsigned int poison_count = 0; + const unsigned int depth = STACKLEAK_SEARCH_DEPTH / sizeof(unsigned long); + + /* Check that 'lowest_stack' value is sane */ + if (unlikely(kstack_ptr - boundary >= THREAD_SIZE)) + kstack_ptr = boundary; + + /* Search for the poison value in the kernel stack */ + while (kstack_ptr > boundary && poison_count <= depth) { + if (*(unsigned long *)kstack_ptr == STACKLEAK_POISON) + poison_count++; + else + poison_count = 0; + + kstack_ptr -= sizeof(unsigned long); + } + + /* + * One 'long int' at the bottom of the thread stack is reserved and + * should not be poisoned (see CONFIG_SCHED_STACK_END_CHECK=y). + */ + if (kstack_ptr == boundary) + kstack_ptr += sizeof(unsigned long); + + /* + * Now write the poison value to the kernel stack. Start from + * 'kstack_ptr' and move up till the new 'boundary'. We assume that + * the stack pointer doesn't change when we write poison. + */ + if (on_thread_stack()) + boundary = current_stack_pointer; + else + boundary = current_top_of_stack(); + + while (kstack_ptr < boundary) { + *(unsigned long *)kstack_ptr = STACKLEAK_POISON; + kstack_ptr += sizeof(unsigned long); + } + + /* Reset the 'lowest_stack' value for the next syscall */ + current->lowest_stack = current_top_of_stack() - THREAD_SIZE/64; +} + -- cgit v1.2.3 From 10e9ae9fabaf96c8e5227c1cd4827d58b3aa406d Mon Sep 17 00:00:00 2001 From: Alexander Popov Date: Fri, 17 Aug 2018 01:16:59 +0300 Subject: gcc-plugins: Add STACKLEAK plugin for tracking the kernel stack The STACKLEAK feature erases the kernel stack before returning from syscalls. That reduces the information which kernel stack leak bugs can reveal and blocks some uninitialized stack variable attacks. This commit introduces the STACKLEAK gcc plugin. It is needed for tracking the lowest border of the kernel stack, which is important for the code erasing the used part of the kernel stack at the end of syscalls (comes in a separate commit). The STACKLEAK feature is ported from grsecurity/PaX. More information at: https://grsecurity.net/ https://pax.grsecurity.net/ This code is modified from Brad Spengler/PaX Team's code in the last public patch of grsecurity/PaX based on our understanding of the code. Changes or omissions from the original code are ours and don't reflect the original grsecurity/PaX code. Signed-off-by: Alexander Popov Tested-by: Laura Abbott Signed-off-by: Kees Cook --- kernel/stackleak.c | 28 +++ scripts/Makefile.gcc-plugins | 10 + scripts/gcc-plugins/Kconfig | 12 + scripts/gcc-plugins/stackleak_plugin.c | 427 +++++++++++++++++++++++++++++++++ 4 files changed, 477 insertions(+) create mode 100644 scripts/gcc-plugins/stackleak_plugin.c (limited to 'kernel') diff --git a/kernel/stackleak.c b/kernel/stackleak.c index deba0d8992f9..628485db37ba 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -60,3 +60,31 @@ asmlinkage void stackleak_erase(void) current->lowest_stack = current_top_of_stack() - THREAD_SIZE/64; } +void __used stackleak_track_stack(void) +{ + /* + * N.B. stackleak_erase() fills the kernel stack with the poison value, + * which has the register width. That code assumes that the value + * of 'lowest_stack' is aligned on the register width boundary. + * + * That is true for x86 and x86_64 because of the kernel stack + * alignment on these platforms (for details, see 'cc_stack_align' in + * arch/x86/Makefile). Take care of that when you port STACKLEAK to + * new platforms. + */ + unsigned long sp = (unsigned long)&sp; + + /* + * Having CONFIG_STACKLEAK_TRACK_MIN_SIZE larger than + * STACKLEAK_SEARCH_DEPTH makes the poison search in + * stackleak_erase() unreliable. Let's prevent that. + */ + BUILD_BUG_ON(CONFIG_STACKLEAK_TRACK_MIN_SIZE > STACKLEAK_SEARCH_DEPTH); + + if (sp < current->lowest_stack && + sp >= (unsigned long)task_stack_page(current) + + sizeof(unsigned long)) { + current->lowest_stack = sp; + } +} +EXPORT_SYMBOL(stackleak_track_stack); diff --git a/scripts/Makefile.gcc-plugins b/scripts/Makefile.gcc-plugins index 0a482f341576..46c5c6809806 100644 --- a/scripts/Makefile.gcc-plugins +++ b/scripts/Makefile.gcc-plugins @@ -26,6 +26,16 @@ gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_RANDSTRUCT) \ gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_RANDSTRUCT_PERFORMANCE) \ += -fplugin-arg-randomize_layout_plugin-performance-mode +gcc-plugin-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak_plugin.so +gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STACKLEAK) \ + += -DSTACKLEAK_PLUGIN +gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STACKLEAK) \ + += -fplugin-arg-stackleak_plugin-track-min-size=$(CONFIG_STACKLEAK_TRACK_MIN_SIZE) +ifdef CONFIG_GCC_PLUGIN_STACKLEAK + DISABLE_STACKLEAK_PLUGIN += -fplugin-arg-stackleak_plugin-disable +endif +export DISABLE_STACKLEAK_PLUGIN + # All the plugin CFLAGS are collected here in case a build target needs to # filter them out of the KBUILD_CFLAGS. GCC_PLUGINS_CFLAGS := $(strip $(addprefix -fplugin=$(objtree)/scripts/gcc-plugins/, $(gcc-plugin-y)) $(gcc-plugin-cflags-y)) diff --git a/scripts/gcc-plugins/Kconfig b/scripts/gcc-plugins/Kconfig index 977b84e69787..c65fdd823591 100644 --- a/scripts/gcc-plugins/Kconfig +++ b/scripts/gcc-plugins/Kconfig @@ -158,4 +158,16 @@ config GCC_PLUGIN_STACKLEAK * https://grsecurity.net/ * https://pax.grsecurity.net/ +config STACKLEAK_TRACK_MIN_SIZE + int "Minimum stack frame size of functions tracked by STACKLEAK" + default 100 + range 0 4096 + depends on GCC_PLUGIN_STACKLEAK + help + The STACKLEAK gcc plugin instruments the kernel code for tracking + the lowest border of the kernel stack (and for some other purposes). + It inserts the stackleak_track_stack() call for the functions with + a stack frame size greater than or equal to this parameter. + If unsure, leave the default value 100. + endif diff --git a/scripts/gcc-plugins/stackleak_plugin.c b/scripts/gcc-plugins/stackleak_plugin.c new file mode 100644 index 000000000000..2f48da98b5d4 --- /dev/null +++ b/scripts/gcc-plugins/stackleak_plugin.c @@ -0,0 +1,427 @@ +/* + * Copyright 2011-2017 by the PaX Team + * Modified by Alexander Popov + * Licensed under the GPL v2 + * + * Note: the choice of the license means that the compilation process is + * NOT 'eligible' as defined by gcc's library exception to the GPL v3, + * but for the kernel it doesn't matter since it doesn't link against + * any of the gcc libraries + * + * This gcc plugin is needed for tracking the lowest border of the kernel stack. + * It instruments the kernel code inserting stackleak_track_stack() calls: + * - after alloca(); + * - for the functions with a stack frame size greater than or equal + * to the "track-min-size" plugin parameter. + * + * This plugin is ported from grsecurity/PaX. For more information see: + * https://grsecurity.net/ + * https://pax.grsecurity.net/ + * + * Debugging: + * - use fprintf() to stderr, debug_generic_expr(), debug_gimple_stmt(), + * print_rtl() and print_simple_rtl(); + * - add "-fdump-tree-all -fdump-rtl-all" to the plugin CFLAGS in + * Makefile.gcc-plugins to see the verbose dumps of the gcc passes; + * - use gcc -E to understand the preprocessing shenanigans; + * - use gcc with enabled CFG/GIMPLE/SSA verification (--enable-checking). + */ + +#include "gcc-common.h" + +__visible int plugin_is_GPL_compatible; + +static int track_frame_size = -1; +static const char track_function[] = "stackleak_track_stack"; + +/* + * Mark these global variables (roots) for gcc garbage collector since + * they point to the garbage-collected memory. + */ +static GTY(()) tree track_function_decl; + +static struct plugin_info stackleak_plugin_info = { + .version = "201707101337", + .help = "track-min-size=nn\ttrack stack for functions with a stack frame size >= nn bytes\n" + "disable\t\tdo not activate the plugin\n" +}; + +static void stackleak_add_track_stack(gimple_stmt_iterator *gsi, bool after) +{ + gimple stmt; + gcall *stackleak_track_stack; + cgraph_node_ptr node; + int frequency; + basic_block bb; + + /* Insert call to void stackleak_track_stack(void) */ + stmt = gimple_build_call(track_function_decl, 0); + stackleak_track_stack = as_a_gcall(stmt); + if (after) { + gsi_insert_after(gsi, stackleak_track_stack, + GSI_CONTINUE_LINKING); + } else { + gsi_insert_before(gsi, stackleak_track_stack, GSI_SAME_STMT); + } + + /* Update the cgraph */ + bb = gimple_bb(stackleak_track_stack); + node = cgraph_get_create_node(track_function_decl); + gcc_assert(node); + frequency = compute_call_stmt_bb_frequency(current_function_decl, bb); + cgraph_create_edge(cgraph_get_node(current_function_decl), node, + stackleak_track_stack, bb->count, frequency); +} + +static bool is_alloca(gimple stmt) +{ + if (gimple_call_builtin_p(stmt, BUILT_IN_ALLOCA)) + return true; + +#if BUILDING_GCC_VERSION >= 4007 + if (gimple_call_builtin_p(stmt, BUILT_IN_ALLOCA_WITH_ALIGN)) + return true; +#endif + + return false; +} + +/* + * Work with the GIMPLE representation of the code. Insert the + * stackleak_track_stack() call after alloca() and into the beginning + * of the function if it is not instrumented. + */ +static unsigned int stackleak_instrument_execute(void) +{ + basic_block bb, entry_bb; + bool prologue_instrumented = false, is_leaf = true; + gimple_stmt_iterator gsi; + + /* + * ENTRY_BLOCK_PTR is a basic block which represents possible entry + * point of a function. This block does not contain any code and + * has a CFG edge to its successor. + */ + gcc_assert(single_succ_p(ENTRY_BLOCK_PTR_FOR_FN(cfun))); + entry_bb = single_succ(ENTRY_BLOCK_PTR_FOR_FN(cfun)); + + /* + * Loop through the GIMPLE statements in each of cfun basic blocks. + * cfun is a global variable which represents the function that is + * currently processed. + */ + FOR_EACH_BB_FN(bb, cfun) { + for (gsi = gsi_start_bb(bb); !gsi_end_p(gsi); gsi_next(&gsi)) { + gimple stmt; + + stmt = gsi_stmt(gsi); + + /* Leaf function is a function which makes no calls */ + if (is_gimple_call(stmt)) + is_leaf = false; + + if (!is_alloca(stmt)) + continue; + + /* Insert stackleak_track_stack() call after alloca() */ + stackleak_add_track_stack(&gsi, true); + if (bb == entry_bb) + prologue_instrumented = true; + } + } + + if (prologue_instrumented) + return 0; + + /* + * Special cases to skip the instrumentation. + * + * Taking the address of static inline functions materializes them, + * but we mustn't instrument some of them as the resulting stack + * alignment required by the function call ABI will break other + * assumptions regarding the expected (but not otherwise enforced) + * register clobbering ABI. + * + * Case in point: native_save_fl on amd64 when optimized for size + * clobbers rdx if it were instrumented here. + * + * TODO: any more special cases? + */ + if (is_leaf && + !TREE_PUBLIC(current_function_decl) && + DECL_DECLARED_INLINE_P(current_function_decl)) { + return 0; + } + + if (is_leaf && + !strncmp(IDENTIFIER_POINTER(DECL_NAME(current_function_decl)), + "_paravirt_", 10)) { + return 0; + } + + /* Insert stackleak_track_stack() call at the function beginning */ + bb = entry_bb; + if (!single_pred_p(bb)) { + /* gcc_assert(bb_loop_depth(bb) || + (bb->flags & BB_IRREDUCIBLE_LOOP)); */ + split_edge(single_succ_edge(ENTRY_BLOCK_PTR_FOR_FN(cfun))); + gcc_assert(single_succ_p(ENTRY_BLOCK_PTR_FOR_FN(cfun))); + bb = single_succ(ENTRY_BLOCK_PTR_FOR_FN(cfun)); + } + gsi = gsi_after_labels(bb); + stackleak_add_track_stack(&gsi, false); + + return 0; +} + +static bool large_stack_frame(void) +{ +#if BUILDING_GCC_VERSION >= 8000 + return maybe_ge(get_frame_size(), track_frame_size); +#else + return (get_frame_size() >= track_frame_size); +#endif +} + +/* + * Work with the RTL representation of the code. + * Remove the unneeded stackleak_track_stack() calls from the functions + * which don't call alloca() and don't have a large enough stack frame size. + */ +static unsigned int stackleak_cleanup_execute(void) +{ + rtx_insn *insn, *next; + + if (cfun->calls_alloca) + return 0; + + if (large_stack_frame()) + return 0; + + /* + * Find stackleak_track_stack() calls. Loop through the chain of insns, + * which is an RTL representation of the code for a function. + * + * The example of a matching insn: + * (call_insn 8 4 10 2 (call (mem (symbol_ref ("stackleak_track_stack") + * [flags 0x41] ) + * [0 stackleak_track_stack S1 A8]) (0)) 675 {*call} (expr_list + * (symbol_ref ("stackleak_track_stack") [flags 0x41] ) (expr_list (0) (nil))) (nil)) + */ + for (insn = get_insns(); insn; insn = next) { + rtx body; + + next = NEXT_INSN(insn); + + /* Check the expression code of the insn */ + if (!CALL_P(insn)) + continue; + + /* + * Check the expression code of the insn body, which is an RTL + * Expression (RTX) describing the side effect performed by + * that insn. + */ + body = PATTERN(insn); + + if (GET_CODE(body) == PARALLEL) + body = XVECEXP(body, 0, 0); + + if (GET_CODE(body) != CALL) + continue; + + /* + * Check the first operand of the call expression. It should + * be a mem RTX describing the needed subroutine with a + * symbol_ref RTX. + */ + body = XEXP(body, 0); + if (GET_CODE(body) != MEM) + continue; + + body = XEXP(body, 0); + if (GET_CODE(body) != SYMBOL_REF) + continue; + + if (SYMBOL_REF_DECL(body) != track_function_decl) + continue; + + /* Delete the stackleak_track_stack() call */ + delete_insn_and_edges(insn); +#if BUILDING_GCC_VERSION >= 4007 && BUILDING_GCC_VERSION < 8000 + if (GET_CODE(next) == NOTE && + NOTE_KIND(next) == NOTE_INSN_CALL_ARG_LOCATION) { + insn = next; + next = NEXT_INSN(insn); + delete_insn_and_edges(insn); + } +#endif + } + + return 0; +} + +static bool stackleak_gate(void) +{ + tree section; + + section = lookup_attribute("section", + DECL_ATTRIBUTES(current_function_decl)); + if (section && TREE_VALUE(section)) { + section = TREE_VALUE(TREE_VALUE(section)); + + if (!strncmp(TREE_STRING_POINTER(section), ".init.text", 10)) + return false; + if (!strncmp(TREE_STRING_POINTER(section), ".devinit.text", 13)) + return false; + if (!strncmp(TREE_STRING_POINTER(section), ".cpuinit.text", 13)) + return false; + if (!strncmp(TREE_STRING_POINTER(section), ".meminit.text", 13)) + return false; + } + + return track_frame_size >= 0; +} + +/* Build the function declaration for stackleak_track_stack() */ +static void stackleak_start_unit(void *gcc_data __unused, + void *user_data __unused) +{ + tree fntype; + + /* void stackleak_track_stack(void) */ + fntype = build_function_type_list(void_type_node, NULL_TREE); + track_function_decl = build_fn_decl(track_function, fntype); + DECL_ASSEMBLER_NAME(track_function_decl); /* for LTO */ + TREE_PUBLIC(track_function_decl) = 1; + TREE_USED(track_function_decl) = 1; + DECL_EXTERNAL(track_function_decl) = 1; + DECL_ARTIFICIAL(track_function_decl) = 1; + DECL_PRESERVE_P(track_function_decl) = 1; +} + +/* + * Pass gate function is a predicate function that gets executed before the + * corresponding pass. If the return value is 'true' the pass gets executed, + * otherwise, it is skipped. + */ +static bool stackleak_instrument_gate(void) +{ + return stackleak_gate(); +} + +#define PASS_NAME stackleak_instrument +#define PROPERTIES_REQUIRED PROP_gimple_leh | PROP_cfg +#define TODO_FLAGS_START TODO_verify_ssa | TODO_verify_flow | TODO_verify_stmts +#define TODO_FLAGS_FINISH TODO_verify_ssa | TODO_verify_stmts | TODO_dump_func \ + | TODO_update_ssa | TODO_rebuild_cgraph_edges +#include "gcc-generate-gimple-pass.h" + +static bool stackleak_cleanup_gate(void) +{ + return stackleak_gate(); +} + +#define PASS_NAME stackleak_cleanup +#define TODO_FLAGS_FINISH TODO_dump_func +#include "gcc-generate-rtl-pass.h" + +/* + * Every gcc plugin exports a plugin_init() function that is called right + * after the plugin is loaded. This function is responsible for registering + * the plugin callbacks and doing other required initialization. + */ +__visible int plugin_init(struct plugin_name_args *plugin_info, + struct plugin_gcc_version *version) +{ + const char * const plugin_name = plugin_info->base_name; + const int argc = plugin_info->argc; + const struct plugin_argument * const argv = plugin_info->argv; + int i = 0; + + /* Extra GGC root tables describing our GTY-ed data */ + static const struct ggc_root_tab gt_ggc_r_gt_stackleak[] = { + { + .base = &track_function_decl, + .nelt = 1, + .stride = sizeof(track_function_decl), + .cb = >_ggc_mx_tree_node, + .pchw = >_pch_nx_tree_node + }, + LAST_GGC_ROOT_TAB + }; + + /* + * The stackleak_instrument pass should be executed before the + * "optimized" pass, which is the control flow graph cleanup that is + * performed just before expanding gcc trees to the RTL. In former + * versions of the plugin this new pass was inserted before the + * "tree_profile" pass, which is currently called "profile". + */ + PASS_INFO(stackleak_instrument, "optimized", 1, + PASS_POS_INSERT_BEFORE); + + /* + * The stackleak_cleanup pass should be executed after the + * "reload" pass, when the stack frame size is final. + */ + PASS_INFO(stackleak_cleanup, "reload", 1, PASS_POS_INSERT_AFTER); + + if (!plugin_default_version_check(version, &gcc_version)) { + error(G_("incompatible gcc/plugin versions")); + return 1; + } + + /* Parse the plugin arguments */ + for (i = 0; i < argc; i++) { + if (!strcmp(argv[i].key, "disable")) + return 0; + + if (!strcmp(argv[i].key, "track-min-size")) { + if (!argv[i].value) { + error(G_("no value supplied for option '-fplugin-arg-%s-%s'"), + plugin_name, argv[i].key); + return 1; + } + + track_frame_size = atoi(argv[i].value); + if (track_frame_size < 0) { + error(G_("invalid option argument '-fplugin-arg-%s-%s=%s'"), + plugin_name, argv[i].key, argv[i].value); + return 1; + } + } else { + error(G_("unknown option '-fplugin-arg-%s-%s'"), + plugin_name, argv[i].key); + return 1; + } + } + + /* Give the information about the plugin */ + register_callback(plugin_name, PLUGIN_INFO, NULL, + &stackleak_plugin_info); + + /* Register to be called before processing a translation unit */ + register_callback(plugin_name, PLUGIN_START_UNIT, + &stackleak_start_unit, NULL); + + /* Register an extra GCC garbage collector (GGC) root table */ + register_callback(plugin_name, PLUGIN_REGISTER_GGC_ROOTS, NULL, + (void *)>_ggc_r_gt_stackleak); + + /* + * Hook into the Pass Manager to register new gcc passes. + * + * The stack frame size info is available only at the last RTL pass, + * when it's too late to insert complex code like a function call. + * So we register two gcc passes to instrument every function at first + * and remove the unneeded instrumentation later. + */ + register_callback(plugin_name, PLUGIN_PASS_MANAGER_SETUP, NULL, + &stackleak_instrument_pass_info); + register_callback(plugin_name, PLUGIN_PASS_MANAGER_SETUP, NULL, + &stackleak_cleanup_pass_info); + + return 0; +} -- cgit v1.2.3 From c8d126275a5fa59394fe17109bdb9812fed296b8 Mon Sep 17 00:00:00 2001 From: Alexander Popov Date: Fri, 17 Aug 2018 01:17:01 +0300 Subject: fs/proc: Show STACKLEAK metrics in the /proc file system Introduce CONFIG_STACKLEAK_METRICS providing STACKLEAK information about tasks via the /proc file system. In particular, /proc//stack_depth shows the maximum kernel stack consumption for the current and previous syscalls. Although this information is not precise, it can be useful for estimating the STACKLEAK performance impact for your workloads. Suggested-by: Ingo Molnar Signed-off-by: Alexander Popov Tested-by: Laura Abbott Signed-off-by: Kees Cook --- fs/proc/base.c | 18 ++++++++++++++++++ include/linux/sched.h | 1 + include/linux/stackleak.h | 3 +++ kernel/stackleak.c | 4 ++++ scripts/gcc-plugins/Kconfig | 12 ++++++++++++ 5 files changed, 38 insertions(+) (limited to 'kernel') diff --git a/fs/proc/base.c b/fs/proc/base.c index ccf86f16d9f0..2a238d68610e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2891,6 +2891,21 @@ static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns, } #endif /* CONFIG_LIVEPATCH */ +#ifdef CONFIG_STACKLEAK_METRICS +static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + unsigned long prev_depth = THREAD_SIZE - + (task->prev_lowest_stack & (THREAD_SIZE - 1)); + unsigned long depth = THREAD_SIZE - + (task->lowest_stack & (THREAD_SIZE - 1)); + + seq_printf(m, "previous stack depth: %lu\nstack depth: %lu\n", + prev_depth, depth); + return 0; +} +#endif /* CONFIG_STACKLEAK_METRICS */ + /* * Thread groups */ @@ -2992,6 +3007,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_LIVEPATCH ONE("patch_state", S_IRUSR, proc_pid_patch_state), #endif +#ifdef CONFIG_STACKLEAK_METRICS + ONE("stack_depth", S_IRUGO, proc_stack_depth), +#endif }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/include/linux/sched.h b/include/linux/sched.h index c1a23acd24e7..ae9d10e14b82 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1194,6 +1194,7 @@ struct task_struct { #ifdef CONFIG_GCC_PLUGIN_STACKLEAK unsigned long lowest_stack; + unsigned long prev_lowest_stack; #endif /* diff --git a/include/linux/stackleak.h b/include/linux/stackleak.h index 628c2b947b89..b911b973d328 100644 --- a/include/linux/stackleak.h +++ b/include/linux/stackleak.h @@ -18,6 +18,9 @@ static inline void stackleak_task_init(struct task_struct *t) { t->lowest_stack = (unsigned long)end_of_stack(t) + sizeof(unsigned long); +# ifdef CONFIG_STACKLEAK_METRICS + t->prev_lowest_stack = t->lowest_stack; +# endif } #else /* !CONFIG_GCC_PLUGIN_STACKLEAK */ static inline void stackleak_task_init(struct task_struct *t) { } diff --git a/kernel/stackleak.c b/kernel/stackleak.c index 628485db37ba..f66239572c89 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -41,6 +41,10 @@ asmlinkage void stackleak_erase(void) if (kstack_ptr == boundary) kstack_ptr += sizeof(unsigned long); +#ifdef CONFIG_STACKLEAK_METRICS + current->prev_lowest_stack = kstack_ptr; +#endif + /* * Now write the poison value to the kernel stack. Start from * 'kstack_ptr' and move up till the new 'boundary'. We assume that diff --git a/scripts/gcc-plugins/Kconfig b/scripts/gcc-plugins/Kconfig index c65fdd823591..b0a015ef5268 100644 --- a/scripts/gcc-plugins/Kconfig +++ b/scripts/gcc-plugins/Kconfig @@ -170,4 +170,16 @@ config STACKLEAK_TRACK_MIN_SIZE a stack frame size greater than or equal to this parameter. If unsure, leave the default value 100. +config STACKLEAK_METRICS + bool "Show STACKLEAK metrics in the /proc file system" + depends on GCC_PLUGIN_STACKLEAK + depends on PROC_FS + help + If this is set, STACKLEAK metrics for every task are available in + the /proc file system. In particular, /proc//stack_depth + shows the maximum kernel stack consumption for the current and + previous syscalls. Although this information is not precise, it + can be useful for estimating the STACKLEAK performance impact for + your workloads. + endif -- cgit v1.2.3 From 964c9dff0091893a9a74a88edf984c6da0b779f7 Mon Sep 17 00:00:00 2001 From: Alexander Popov Date: Fri, 17 Aug 2018 01:17:03 +0300 Subject: stackleak: Allow runtime disabling of kernel stack erasing Introduce CONFIG_STACKLEAK_RUNTIME_DISABLE option, which provides 'stack_erasing' sysctl. It can be used in runtime to control kernel stack erasing for kernels built with CONFIG_GCC_PLUGIN_STACKLEAK. Suggested-by: Ingo Molnar Signed-off-by: Alexander Popov Tested-by: Laura Abbott Signed-off-by: Kees Cook --- Documentation/sysctl/kernel.txt | 18 ++++++++++++++++++ include/linux/stackleak.h | 6 ++++++ kernel/stackleak.c | 38 ++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 15 ++++++++++++++- scripts/gcc-plugins/Kconfig | 8 ++++++++ 5 files changed, 84 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 37a679501ddc..1b8775298cf7 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -89,6 +89,7 @@ show up in /proc/sys/kernel: - shmmni - softlockup_all_cpu_backtrace - soft_watchdog +- stack_erasing - stop-a [ SPARC only ] - sysrq ==> Documentation/admin-guide/sysrq.rst - sysctl_writes_strict @@ -987,6 +988,23 @@ detect a hard lockup condition. ============================================================== +stack_erasing + +This parameter can be used to control kernel stack erasing at the end +of syscalls for kernels built with CONFIG_GCC_PLUGIN_STACKLEAK. + +That erasing reduces the information which kernel stack leak bugs +can reveal and blocks some uninitialized stack variable attacks. +The tradeoff is the performance impact: on a single CPU system kernel +compilation sees a 1% slowdown, other systems and workloads may vary. + + 0: kernel stack erasing is disabled, STACKLEAK_METRICS are not updated. + + 1: kernel stack erasing is enabled (default), it is performed before + returning to the userspace at the end of syscalls. + +============================================================== + tainted: Non-zero if the kernel has been tainted. Numeric values, which can be diff --git a/include/linux/stackleak.h b/include/linux/stackleak.h index b911b973d328..3d5c3271a9a8 100644 --- a/include/linux/stackleak.h +++ b/include/linux/stackleak.h @@ -22,6 +22,12 @@ static inline void stackleak_task_init(struct task_struct *t) t->prev_lowest_stack = t->lowest_stack; # endif } + +#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE +int stack_erasing_sysctl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +#endif + #else /* !CONFIG_GCC_PLUGIN_STACKLEAK */ static inline void stackleak_task_init(struct task_struct *t) { } #endif diff --git a/kernel/stackleak.c b/kernel/stackleak.c index f66239572c89..e42892926244 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -12,6 +12,41 @@ #include +#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE +#include +#include + +static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass); + +int stack_erasing_sysctl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret = 0; + int state = !static_branch_unlikely(&stack_erasing_bypass); + int prev_state = state; + + table->data = &state; + table->maxlen = sizeof(int); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + state = !!state; + if (ret || !write || state == prev_state) + return ret; + + if (state) + static_branch_disable(&stack_erasing_bypass); + else + static_branch_enable(&stack_erasing_bypass); + + pr_warn("stackleak: kernel stack erasing is %s\n", + state ? "enabled" : "disabled"); + return ret; +} + +#define skip_erasing() static_branch_unlikely(&stack_erasing_bypass) +#else +#define skip_erasing() false +#endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */ + asmlinkage void stackleak_erase(void) { /* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */ @@ -20,6 +55,9 @@ asmlinkage void stackleak_erase(void) unsigned int poison_count = 0; const unsigned int depth = STACKLEAK_SEARCH_DEPTH / sizeof(unsigned long); + if (skip_erasing()) + return; + /* Check that 'lowest_stack' value is sane */ if (unlikely(kstack_ptr - boundary >= THREAD_SIZE)) kstack_ptr = boundary; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cc02050fd0c4..3ae223f7b5df 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -91,7 +91,9 @@ #ifdef CONFIG_CHR_DEV_SG #include #endif - +#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE +#include +#endif #ifdef CONFIG_LOCKUP_DETECTOR #include #endif @@ -1232,6 +1234,17 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#endif +#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE + { + .procname = "stack_erasing", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = stack_erasing_sysctl, + .extra1 = &zero, + .extra2 = &one, + }, #endif { } }; diff --git a/scripts/gcc-plugins/Kconfig b/scripts/gcc-plugins/Kconfig index b0a015ef5268..0d5c799688f0 100644 --- a/scripts/gcc-plugins/Kconfig +++ b/scripts/gcc-plugins/Kconfig @@ -182,4 +182,12 @@ config STACKLEAK_METRICS can be useful for estimating the STACKLEAK performance impact for your workloads. +config STACKLEAK_RUNTIME_DISABLE + bool "Allow runtime disabling of kernel stack erasing" + depends on GCC_PLUGIN_STACKLEAK + help + This option provides 'stack_erasing' sysctl, which can be used in + runtime to control kernel stack erasing for kernels built with + CONFIG_GCC_PLUGIN_STACKLEAK. + endif -- cgit v1.2.3 From 80ffbaa5b1bd98e80e3239a3b8cfda2da433009a Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 3 Sep 2018 06:09:34 -0600 Subject: kallsyms: reduce size a little on 64-bit Both kallsyms_num_syms and kallsyms_markers[] don't really need to use unsigned long as their (base) types; unsigned int fully suffices. Signed-off-by: Jan Beulich Signed-off-by: Masahiro Yamada --- kernel/kallsyms.c | 4 ++-- scripts/kallsyms.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 02a0b01380d8..f3a04994e063 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -37,7 +37,7 @@ extern const u8 kallsyms_names[] __weak; * Tell the compiler that the count isn't in the small data section if the arch * has one (eg: FRV). */ -extern const unsigned long kallsyms_num_syms +extern const unsigned int kallsyms_num_syms __attribute__((weak, section(".rodata"))); extern const unsigned long kallsyms_relative_base @@ -46,7 +46,7 @@ __attribute__((weak, section(".rodata"))); extern const u8 kallsyms_token_table[] __weak; extern const u16 kallsyms_token_index[] __weak; -extern const unsigned long kallsyms_markers[] __weak; +extern const unsigned int kallsyms_markers[] __weak; /* * Expand a compressed symbol data into the resulting uncompressed string, diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index a9186a98a37d..085b6a584fe0 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -405,7 +405,7 @@ static void write_src(void) } output_label("kallsyms_num_syms"); - printf("\tPTR\t%u\n", table_cnt); + printf("\t.long\t%u\n", table_cnt); printf("\n"); /* table of offset markers, that give the offset in the compressed stream @@ -434,7 +434,7 @@ static void write_src(void) output_label("kallsyms_markers"); for (i = 0; i < ((table_cnt + 255) >> 8); i++) - printf("\tPTR\t%d\n", markers[i]); + printf("\t.long\t%u\n", markers[i]); printf("\n"); free(markers); -- cgit v1.2.3 From d6b183eda466415bb5defcf9afe4cb64734839e8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 24 Aug 2018 16:20:28 -0400 Subject: tracing/kprobe: Remove unneeded extra strchr() from create_trace_kprobe() By utilizing a temporary variable, we can avoid adding another call to strchr(). Instead, save the first call to a temp variable, and then use that variable as the reference to set the event variable. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c30032367aab..508396edc56a 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -749,10 +749,13 @@ static int create_trace_kprobe(int argc, char **argv) } if (event) { - if (strchr(event, '/')) { + char *slash; + + slash = strchr(event, '/'); + if (slash) { group = event; - event = strchr(group, '/') + 1; - event[-1] = '\0'; + event = slash + 1; + slash[0] = '\0'; if (strlen(group) == 0) { pr_info("Group name is not specified\n"); return -EINVAL; -- cgit v1.2.3 From 1cc33161a83d20b5462b1e93f95d3ce6388079ee Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 20 Aug 2018 10:12:47 +0530 Subject: uprobes: Support SDT markers having reference count (semaphore) Userspace Statically Defined Tracepoints[1] are dtrace style markers inside userspace applications. Applications like PostgreSQL, MySQL, Pthread, Perl, Python, Java, Ruby, Node.js, libvirt, QEMU, glib etc have these markers embedded in them. These markers are added by developer at important places in the code. Each marker source expands to a single nop instruction in the compiled code but there may be additional overhead for computing the marker arguments which expands to couple of instructions. In case the overhead is more, execution of it can be omitted by runtime if() condition when no one is tracing on the marker: if (reference_counter > 0) { Execute marker instructions; } Default value of reference counter is 0. Tracer has to increment the reference counter before tracing on a marker and decrement it when done with the tracing. Implement the reference counter logic in core uprobe. User will be able to use it from trace_uprobe as well as from kernel module. New trace_uprobe definition with reference counter will now be: :[(ref_ctr_offset)] where ref_ctr_offset is an optional field. For kernel module, new variant of uprobe_register() has been introduced: uprobe_register_refctr(inode, offset, ref_ctr_offset, consumer) No new variant for uprobe_unregister() because it's assumed to have only one reference counter for one uprobe. [1] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation Note: 'reference counter' is called as 'semaphore' in original Dtrace (or Systemtap, bcc and even in ELF) documentation and code. But the term 'semaphore' is misleading in this context. This is just a counter used to hold number of tracers tracing on a marker. This is not really used for any synchronization. So we are calling it a 'reference counter' in kernel / perf code. Link: http://lkml.kernel.org/r/20180820044250.11659-2-ravi.bangoria@linux.ibm.com Reviewed-by: Masami Hiramatsu [Only trace_uprobe.c] Reviewed-by: Oleg Nesterov Reviewed-by: Song Liu Tested-by: Song Liu Signed-off-by: Ravi Bangoria Signed-off-by: Steven Rostedt (VMware) --- include/linux/uprobes.h | 5 + kernel/events/uprobes.c | 259 ++++++++++++++++++++++++++++++++++++++++++-- kernel/trace/trace.c | 2 +- kernel/trace/trace_uprobe.c | 38 ++++++- 4 files changed, 293 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index bb9d2084af03..103a48a48872 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -123,6 +123,7 @@ extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); +extern int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern int uprobe_mmap(struct vm_area_struct *vma); @@ -160,6 +161,10 @@ uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) { return -ENOSYS; } +static inline int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc) +{ + return -ENOSYS; +} static inline int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool add) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 3207a4d26849..934feb39f6be 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -73,6 +73,7 @@ struct uprobe { struct uprobe_consumer *consumers; struct inode *inode; /* Also hold a ref to inode */ loff_t offset; + loff_t ref_ctr_offset; unsigned long flags; /* @@ -88,6 +89,15 @@ struct uprobe { struct arch_uprobe arch; }; +struct delayed_uprobe { + struct list_head list; + struct uprobe *uprobe; + struct mm_struct *mm; +}; + +static DEFINE_MUTEX(delayed_uprobe_lock); +static LIST_HEAD(delayed_uprobe_list); + /* * Execute out of line area: anonymous executable mapping installed * by the probed task to execute the copy of the original instruction @@ -282,6 +292,166 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t return 1; } +static struct delayed_uprobe * +delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm) +{ + struct delayed_uprobe *du; + + list_for_each_entry(du, &delayed_uprobe_list, list) + if (du->uprobe == uprobe && du->mm == mm) + return du; + return NULL; +} + +static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm) +{ + struct delayed_uprobe *du; + + if (delayed_uprobe_check(uprobe, mm)) + return 0; + + du = kzalloc(sizeof(*du), GFP_KERNEL); + if (!du) + return -ENOMEM; + + du->uprobe = uprobe; + du->mm = mm; + list_add(&du->list, &delayed_uprobe_list); + return 0; +} + +static void delayed_uprobe_delete(struct delayed_uprobe *du) +{ + if (WARN_ON(!du)) + return; + list_del(&du->list); + kfree(du); +} + +static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm) +{ + struct list_head *pos, *q; + struct delayed_uprobe *du; + + if (!uprobe && !mm) + return; + + list_for_each_safe(pos, q, &delayed_uprobe_list) { + du = list_entry(pos, struct delayed_uprobe, list); + + if (uprobe && du->uprobe != uprobe) + continue; + if (mm && du->mm != mm) + continue; + + delayed_uprobe_delete(du); + } +} + +static bool valid_ref_ctr_vma(struct uprobe *uprobe, + struct vm_area_struct *vma) +{ + unsigned long vaddr = offset_to_vaddr(vma, uprobe->ref_ctr_offset); + + return uprobe->ref_ctr_offset && + vma->vm_file && + file_inode(vma->vm_file) == uprobe->inode && + (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && + vma->vm_start <= vaddr && + vma->vm_end > vaddr; +} + +static struct vm_area_struct * +find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm) +{ + struct vm_area_struct *tmp; + + for (tmp = mm->mmap; tmp; tmp = tmp->vm_next) + if (valid_ref_ctr_vma(uprobe, tmp)) + return tmp; + + return NULL; +} + +static int +__update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) +{ + void *kaddr; + struct page *page; + struct vm_area_struct *vma; + int ret; + short *ptr; + + if (!vaddr || !d) + return -EINVAL; + + ret = get_user_pages_remote(NULL, mm, vaddr, 1, + FOLL_WRITE, &page, &vma, NULL); + if (unlikely(ret <= 0)) { + /* + * We are asking for 1 page. If get_user_pages_remote() fails, + * it may return 0, in that case we have to return error. + */ + return ret == 0 ? -EBUSY : ret; + } + + kaddr = kmap_atomic(page); + ptr = kaddr + (vaddr & ~PAGE_MASK); + + if (unlikely(*ptr + d < 0)) { + pr_warn("ref_ctr going negative. vaddr: 0x%lx, " + "curr val: %d, delta: %d\n", vaddr, *ptr, d); + ret = -EINVAL; + goto out; + } + + *ptr += d; + ret = 0; +out: + kunmap_atomic(kaddr); + put_page(page); + return ret; +} + +static void update_ref_ctr_warn(struct uprobe *uprobe, + struct mm_struct *mm, short d) +{ + pr_warn("ref_ctr %s failed for inode: 0x%lx offset: " + "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%pK\n", + d > 0 ? "increment" : "decrement", uprobe->inode->i_ino, + (unsigned long long) uprobe->offset, + (unsigned long long) uprobe->ref_ctr_offset, mm); +} + +static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, + short d) +{ + struct vm_area_struct *rc_vma; + unsigned long rc_vaddr; + int ret = 0; + + rc_vma = find_ref_ctr_vma(uprobe, mm); + + if (rc_vma) { + rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset); + ret = __update_ref_ctr(mm, rc_vaddr, d); + if (ret) + update_ref_ctr_warn(uprobe, mm, d); + + if (d > 0) + return ret; + } + + mutex_lock(&delayed_uprobe_lock); + if (d > 0) + ret = delayed_uprobe_add(uprobe, mm); + else + delayed_uprobe_remove(uprobe, mm); + mutex_unlock(&delayed_uprobe_lock); + + return ret; +} + /* * NOTE: * Expect the breakpoint instruction to be the smallest size instruction for @@ -302,9 +472,13 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t opcode) { + struct uprobe *uprobe; struct page *old_page, *new_page; struct vm_area_struct *vma; - int ret; + int ret, is_register, ref_ctr_updated = 0; + + is_register = is_swbp_insn(&opcode); + uprobe = container_of(auprobe, struct uprobe, arch); retry: /* Read the page with vaddr into memory */ @@ -317,6 +491,15 @@ retry: if (ret <= 0) goto put_old; + /* We are going to replace instruction, update ref_ctr. */ + if (!ref_ctr_updated && uprobe->ref_ctr_offset) { + ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); + if (ret) + goto put_old; + + ref_ctr_updated = 1; + } + ret = anon_vma_prepare(vma); if (ret) goto put_old; @@ -337,6 +520,11 @@ put_old: if (unlikely(ret == -EAGAIN)) goto retry; + + /* Revert back reference counter if instruction update failed. */ + if (ret && is_register && ref_ctr_updated) + update_ref_ctr(uprobe, mm, -1); + return ret; } @@ -378,8 +566,15 @@ static struct uprobe *get_uprobe(struct uprobe *uprobe) static void put_uprobe(struct uprobe *uprobe) { - if (atomic_dec_and_test(&uprobe->ref)) + if (atomic_dec_and_test(&uprobe->ref)) { + /* + * If application munmap(exec_vma) before uprobe_unregister() + * gets called, we don't get a chance to remove uprobe from + * delayed_uprobe_list from remove_breakpoint(). Do it here. + */ + delayed_uprobe_remove(uprobe, NULL); kfree(uprobe); + } } static int match_uprobe(struct uprobe *l, struct uprobe *r) @@ -484,7 +679,8 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) return u; } -static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) +static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, + loff_t ref_ctr_offset) { struct uprobe *uprobe, *cur_uprobe; @@ -494,6 +690,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->inode = inode; uprobe->offset = offset; + uprobe->ref_ctr_offset = ref_ctr_offset; init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); @@ -895,7 +1092,7 @@ EXPORT_SYMBOL_GPL(uprobe_unregister); * else return 0 (success) */ static int __uprobe_register(struct inode *inode, loff_t offset, - struct uprobe_consumer *uc) + loff_t ref_ctr_offset, struct uprobe_consumer *uc) { struct uprobe *uprobe; int ret; @@ -912,7 +1109,7 @@ static int __uprobe_register(struct inode *inode, loff_t offset, return -EINVAL; retry: - uprobe = alloc_uprobe(inode, offset); + uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); if (!uprobe) return -ENOMEM; /* @@ -938,10 +1135,17 @@ static int __uprobe_register(struct inode *inode, loff_t offset, int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) { - return __uprobe_register(inode, offset, uc); + return __uprobe_register(inode, offset, 0, uc); } EXPORT_SYMBOL_GPL(uprobe_register); +int uprobe_register_refctr(struct inode *inode, loff_t offset, + loff_t ref_ctr_offset, struct uprobe_consumer *uc) +{ + return __uprobe_register(inode, offset, ref_ctr_offset, uc); +} +EXPORT_SYMBOL_GPL(uprobe_register_refctr); + /* * uprobe_apply - unregister an already registered probe. * @inode: the file in which the probe has to be removed. @@ -1060,6 +1264,35 @@ static void build_probe_list(struct inode *inode, spin_unlock(&uprobes_treelock); } +/* @vma contains reference counter, not the probed instruction. */ +static int delayed_ref_ctr_inc(struct vm_area_struct *vma) +{ + struct list_head *pos, *q; + struct delayed_uprobe *du; + unsigned long vaddr; + int ret = 0, err = 0; + + mutex_lock(&delayed_uprobe_lock); + list_for_each_safe(pos, q, &delayed_uprobe_list) { + du = list_entry(pos, struct delayed_uprobe, list); + + if (du->mm != vma->vm_mm || + !valid_ref_ctr_vma(du->uprobe, vma)) + continue; + + vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset); + ret = __update_ref_ctr(vma->vm_mm, vaddr, 1); + if (ret) { + update_ref_ctr_warn(du->uprobe, vma->vm_mm, 1); + if (!err) + err = ret; + } + delayed_uprobe_delete(du); + } + mutex_unlock(&delayed_uprobe_lock); + return err; +} + /* * Called from mmap_region/vma_adjust with mm->mmap_sem acquired. * @@ -1072,7 +1305,15 @@ int uprobe_mmap(struct vm_area_struct *vma) struct uprobe *uprobe, *u; struct inode *inode; - if (no_uprobe_events() || !valid_vma(vma, true)) + if (no_uprobe_events()) + return 0; + + if (vma->vm_file && + (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && + test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags)) + delayed_ref_ctr_inc(vma); + + if (!valid_vma(vma, true)) return 0; inode = file_inode(vma->vm_file); @@ -1246,6 +1487,10 @@ void uprobe_clear_state(struct mm_struct *mm) { struct xol_area *area = mm->uprobes_state.xol_area; + mutex_lock(&delayed_uprobe_lock); + delayed_uprobe_remove(NULL, mm); + mutex_unlock(&delayed_uprobe_lock); + if (!area) return; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bf6f1d70484d..147be8523560 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4621,7 +4621,7 @@ static const char readme_msg[] = "place (kretprobe): [:][+]|\n" #endif #ifdef CONFIG_UPROBE_EVENTS - "\t place: :\n" + " place (uprobe): :[(ref_ctr_offset)]\n" #endif "\t args: =fetcharg[:type]\n" "\t fetcharg: %, @
, @[+|-],\n" diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index e696667da29a..7b85172beab6 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -47,6 +47,7 @@ struct trace_uprobe { struct inode *inode; char *filename; unsigned long offset; + unsigned long ref_ctr_offset; unsigned long nhit; struct trace_probe tp; }; @@ -352,10 +353,10 @@ end: static int create_trace_uprobe(int argc, char **argv) { struct trace_uprobe *tu; - char *arg, *event, *group, *filename; + char *arg, *event, *group, *filename, *rctr, *rctr_end; char buf[MAX_EVENT_NAME_LEN]; struct path path; - unsigned long offset; + unsigned long offset, ref_ctr_offset; bool is_delete, is_return; int i, ret; @@ -364,6 +365,7 @@ static int create_trace_uprobe(int argc, char **argv) is_return = false; event = NULL; group = NULL; + ref_ctr_offset = 0; /* argc must be >= 1 */ if (argv[0][0] == '-') @@ -438,6 +440,26 @@ static int create_trace_uprobe(int argc, char **argv) goto fail_address_parse; } + /* Parse reference counter offset if specified. */ + rctr = strchr(arg, '('); + if (rctr) { + rctr_end = strchr(rctr, ')'); + if (rctr > rctr_end || *(rctr_end + 1) != 0) { + ret = -EINVAL; + pr_info("Invalid reference counter offset.\n"); + goto fail_address_parse; + } + + *rctr++ = '\0'; + *rctr_end = '\0'; + ret = kstrtoul(rctr, 0, &ref_ctr_offset); + if (ret) { + pr_info("Invalid reference counter offset.\n"); + goto fail_address_parse; + } + } + + /* Parse uprobe offset. */ ret = kstrtoul(arg, 0, &offset); if (ret) goto fail_address_parse; @@ -472,6 +494,7 @@ static int create_trace_uprobe(int argc, char **argv) goto fail_address_parse; } tu->offset = offset; + tu->ref_ctr_offset = ref_ctr_offset; tu->path = path; tu->filename = kstrdup(filename, GFP_KERNEL); @@ -590,6 +613,9 @@ static int probes_seq_show(struct seq_file *m, void *v) trace_event_name(&tu->tp.call), tu->filename, (int)(sizeof(void *) * 2), tu->offset); + if (tu->ref_ctr_offset) + seq_printf(m, "(0x%lx)", tu->ref_ctr_offset); + for (i = 0; i < tu->tp.nr_args; i++) seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); @@ -905,7 +931,13 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file, tu->consumer.filter = filter; tu->inode = d_real_inode(tu->path.dentry); - ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); + if (tu->ref_ctr_offset) { + ret = uprobe_register_refctr(tu->inode, tu->offset, + tu->ref_ctr_offset, &tu->consumer); + } else { + ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); + } + if (ret) goto err_buffer; -- cgit v1.2.3 From 22bad38286d9a652d7061a02f9743bb2ebb84e59 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 20 Aug 2018 10:12:48 +0530 Subject: uprobes/sdt: Prevent multiple reference counter for same uprobe We assume to have only one reference counter for one uprobe. Don't allow user to register multiple uprobes having same inode+offset but different reference counter. Link: http://lkml.kernel.org/r/20180820044250.11659-3-ravi.bangoria@linux.ibm.com Acked-by: Srikar Dronamraju Reviewed-by: Oleg Nesterov Reviewed-by: Song Liu Tested-by: Song Liu Signed-off-by: Ravi Bangoria Signed-off-by: Steven Rostedt (VMware) --- kernel/events/uprobes.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 934feb39f6be..96fb51f3994f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -679,6 +679,16 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) return u; } +static void +ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe) +{ + pr_warn("ref_ctr_offset mismatch. inode: 0x%lx offset: 0x%llx " + "ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n", + uprobe->inode->i_ino, (unsigned long long) uprobe->offset, + (unsigned long long) cur_uprobe->ref_ctr_offset, + (unsigned long long) uprobe->ref_ctr_offset); +} + static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, loff_t ref_ctr_offset) { @@ -698,6 +708,12 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, cur_uprobe = insert_uprobe(uprobe); /* a uprobe exists for this inode:offset combination */ if (cur_uprobe) { + if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) { + ref_ctr_mismatch_warn(cur_uprobe, uprobe); + put_uprobe(cur_uprobe); + kfree(uprobe); + return ERR_PTR(-EINVAL); + } kfree(uprobe); uprobe = cur_uprobe; } @@ -1112,6 +1128,9 @@ static int __uprobe_register(struct inode *inode, loff_t offset, uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); if (!uprobe) return -ENOMEM; + if (IS_ERR(uprobe)) + return PTR_ERR(uprobe); + /* * We can race with uprobe_unregister()->delete_uprobe(). * Check uprobe_is_active() and retry if it is false. -- cgit v1.2.3 From ccea8727dc27d8f46df6557f9260ab32760ef409 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 20 Aug 2018 10:12:49 +0530 Subject: trace_uprobe/sdt: Prevent multiple reference counter for same uprobe We assume to have only one reference counter for one uprobe. Don't allow user to add multiple trace_uprobe entries having same inode+offset but different reference counter. Ex, # echo "p:sdt_tick/loop2 /home/ravi/tick:0x6e4(0x10036)" > uprobe_events # echo "p:sdt_tick/loop2_1 /home/ravi/tick:0x6e4(0xfffff)" >> uprobe_events bash: echo: write error: Invalid argument # dmesg trace_kprobe: Reference counter offset mismatch. There is one exception though: When user is trying to replace the old entry with the new one, we allow this if the new entry does not conflict with any other existing entries. Link: http://lkml.kernel.org/r/20180820044250.11659-4-ravi.bangoria@linux.ibm.com Acked-by: Srikar Dronamraju Reviewed-by: Song Liu Reviewed-by: Oleg Nesterov Tested-by: Song Liu Signed-off-by: Ravi Bangoria Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 7b85172beab6..3a7c73c40007 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -312,6 +312,35 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu) return 0; } +/* + * Uprobe with multiple reference counter is not allowed. i.e. + * If inode and offset matches, reference counter offset *must* + * match as well. Though, there is one exception: If user is + * replacing old trace_uprobe with new one(same group/event), + * then we allow same uprobe with new reference counter as far + * as the new one does not conflict with any other existing + * ones. + */ +static struct trace_uprobe *find_old_trace_uprobe(struct trace_uprobe *new) +{ + struct trace_uprobe *tmp, *old = NULL; + struct inode *new_inode = d_real_inode(new->path.dentry); + + old = find_probe_event(trace_event_name(&new->tp.call), + new->tp.call.class->system); + + list_for_each_entry(tmp, &uprobe_list, list) { + if ((old ? old != tmp : true) && + new_inode == d_real_inode(tmp->path.dentry) && + new->offset == tmp->offset && + new->ref_ctr_offset != tmp->ref_ctr_offset) { + pr_warn("Reference counter offset mismatch."); + return ERR_PTR(-EINVAL); + } + } + return old; +} + /* Register a trace_uprobe and probe_event */ static int register_trace_uprobe(struct trace_uprobe *tu) { @@ -321,8 +350,12 @@ static int register_trace_uprobe(struct trace_uprobe *tu) mutex_lock(&uprobe_lock); /* register as an event */ - old_tu = find_probe_event(trace_event_name(&tu->tp.call), - tu->tp.call.class->system); + old_tu = find_old_trace_uprobe(tu); + if (IS_ERR(old_tu)) { + ret = PTR_ERR(old_tu); + goto end; + } + if (old_tu) { /* delete old event */ ret = unregister_trace_uprobe(old_tu); -- cgit v1.2.3 From ca9184f0797c4893db928527fb2b6999eb753ccb Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 23 Jul 2018 11:31:59 +0300 Subject: tracing: Trivia spelling fix containerof() -> container_of() This is the only location on kernel that has wrong spelling of the container_of() helper. Fix it. Signed-off-by: Andy Shevchenko Acked-by: Steven Rostedt (VMware) Signed-off-by: Andy Shevchenko --- kernel/trace/trace_printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index b0875b327f5c..c3fd849d4a8f 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -115,7 +115,7 @@ static int module_trace_bprintk_format_notify(struct notifier_block *self, * section, then we need to read the link list pointers. The trick is * we pass the address of the string to the seq function just like * we do for the kernel core formats. To get back the structure that - * holds the format, we simply use containerof() and then go to the + * holds the format, we simply use container_of() and then go to the * next format in the list. */ static const char ** -- cgit v1.2.3 From a6ca88b241d5e929e6e60b12ad8cd288f0ffa256 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 1 Oct 2018 22:36:36 -0700 Subject: trace_uprobe: support reference counter in fd-based uprobe This patch enables uprobes with reference counter in fd-based uprobe. Highest 32 bits of perf_event_attr.config is used to stored offset of the reference count (semaphore). Format information in /sys/bus/event_source/devices/uprobe/format/ is updated to reflect this new feature. Link: http://lkml.kernel.org/r/20181002053636.1896903-1-songliubraving@fb.com Cc: Oleg Nesterov Acked-by: Peter Zijlstra (Intel) Reviewed-and-tested-by: Ravi Bangoria Signed-off-by: Song Liu Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 3 ++- kernel/events/core.c | 49 ++++++++++++++++++++++++++++++++--------- kernel/trace/trace_event_perf.c | 7 +++--- kernel/trace/trace_probe.h | 3 ++- kernel/trace/trace_uprobe.c | 4 +++- 5 files changed, 50 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 78a010e19ed4..4130a5497d40 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -575,7 +575,8 @@ extern int bpf_get_kprobe_info(const struct perf_event *event, bool perf_type_tracepoint); #endif #ifdef CONFIG_UPROBE_EVENTS -extern int perf_uprobe_init(struct perf_event *event, bool is_retprobe); +extern int perf_uprobe_init(struct perf_event *event, + unsigned long ref_ctr_offset, bool is_retprobe); extern void perf_uprobe_destroy(struct perf_event *event); extern int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, const char **filename, diff --git a/kernel/events/core.c b/kernel/events/core.c index c80549bf82c6..65b30773af3e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8368,30 +8368,39 @@ static struct pmu perf_tracepoint = { * * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe * if not set, create kprobe/uprobe + * + * The following values specify a reference counter (or semaphore in the + * terminology of tools like dtrace, systemtap, etc.) Userspace Statically + * Defined Tracepoints (USDT). Currently, we use 40 bit for the offset. + * + * PERF_UPROBE_REF_CTR_OFFSET_BITS # of bits in config as th offset + * PERF_UPROBE_REF_CTR_OFFSET_SHIFT # of bits to shift left */ enum perf_probe_config { PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */ + PERF_UPROBE_REF_CTR_OFFSET_BITS = 32, + PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS, }; PMU_FORMAT_ATTR(retprobe, "config:0"); +#endif -static struct attribute *probe_attrs[] = { +#ifdef CONFIG_KPROBE_EVENTS +static struct attribute *kprobe_attrs[] = { &format_attr_retprobe.attr, NULL, }; -static struct attribute_group probe_format_group = { +static struct attribute_group kprobe_format_group = { .name = "format", - .attrs = probe_attrs, + .attrs = kprobe_attrs, }; -static const struct attribute_group *probe_attr_groups[] = { - &probe_format_group, +static const struct attribute_group *kprobe_attr_groups[] = { + &kprobe_format_group, NULL, }; -#endif -#ifdef CONFIG_KPROBE_EVENTS static int perf_kprobe_event_init(struct perf_event *event); static struct pmu perf_kprobe = { .task_ctx_nr = perf_sw_context, @@ -8401,7 +8410,7 @@ static struct pmu perf_kprobe = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, - .attr_groups = probe_attr_groups, + .attr_groups = kprobe_attr_groups, }; static int perf_kprobe_event_init(struct perf_event *event) @@ -8433,6 +8442,24 @@ static int perf_kprobe_event_init(struct perf_event *event) #endif /* CONFIG_KPROBE_EVENTS */ #ifdef CONFIG_UPROBE_EVENTS +PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63"); + +static struct attribute *uprobe_attrs[] = { + &format_attr_retprobe.attr, + &format_attr_ref_ctr_offset.attr, + NULL, +}; + +static struct attribute_group uprobe_format_group = { + .name = "format", + .attrs = uprobe_attrs, +}; + +static const struct attribute_group *uprobe_attr_groups[] = { + &uprobe_format_group, + NULL, +}; + static int perf_uprobe_event_init(struct perf_event *event); static struct pmu perf_uprobe = { .task_ctx_nr = perf_sw_context, @@ -8442,12 +8469,13 @@ static struct pmu perf_uprobe = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, - .attr_groups = probe_attr_groups, + .attr_groups = uprobe_attr_groups, }; static int perf_uprobe_event_init(struct perf_event *event) { int err; + unsigned long ref_ctr_offset; bool is_retprobe; if (event->attr.type != perf_uprobe.type) @@ -8463,7 +8491,8 @@ static int perf_uprobe_event_init(struct perf_event *event) return -EOPNOTSUPP; is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; - err = perf_uprobe_init(event, is_retprobe); + ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT; + err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe); if (err) return err; diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 69a3fe926e8c..76217bbef815 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -290,7 +290,8 @@ void perf_kprobe_destroy(struct perf_event *p_event) #endif /* CONFIG_KPROBE_EVENTS */ #ifdef CONFIG_UPROBE_EVENTS -int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe) +int perf_uprobe_init(struct perf_event *p_event, + unsigned long ref_ctr_offset, bool is_retprobe) { int ret; char *path = NULL; @@ -312,8 +313,8 @@ int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe) goto out; } - tp_event = create_local_trace_uprobe( - path, p_event->attr.probe_offset, is_retprobe); + tp_event = create_local_trace_uprobe(path, p_event->attr.probe_offset, + ref_ctr_offset, is_retprobe); if (IS_ERR(tp_event)) { ret = PTR_ERR(tp_event); goto out; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 5f52668e165d..03b10f3201a5 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -412,6 +412,7 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, extern void destroy_local_trace_kprobe(struct trace_event_call *event_call); extern struct trace_event_call * -create_local_trace_uprobe(char *name, unsigned long offs, bool is_return); +create_local_trace_uprobe(char *name, unsigned long offs, + unsigned long ref_ctr_offset, bool is_return); extern void destroy_local_trace_uprobe(struct trace_event_call *event_call); #endif diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 3a7c73c40007..d09638706fe0 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1405,7 +1405,8 @@ static int unregister_uprobe_event(struct trace_uprobe *tu) #ifdef CONFIG_PERF_EVENTS struct trace_event_call * -create_local_trace_uprobe(char *name, unsigned long offs, bool is_return) +create_local_trace_uprobe(char *name, unsigned long offs, + unsigned long ref_ctr_offset, bool is_return) { struct trace_uprobe *tu; struct path path; @@ -1437,6 +1438,7 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return) tu->offset = offs; tu->path = path; + tu->ref_ctr_offset = ref_ctr_offset; tu->filename = kstrdup(name, GFP_KERNEL); init_trace_event_call(tu, &tu->tp.call); -- cgit v1.2.3 From 56de763052792669d61d79a087611da9a7f04d4e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 25 Apr 2018 21:16:36 +0900 Subject: tracing: probeevent: Cleanup print argument functions Cleanup the print-argument function to decouple it into print-name and print-value, so that it can support more flexible expression, like array type. Link: http://lkml.kernel.org/r/152465859635.26224.13452846788717102315.stgit@devbox Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 20 ++++++-------------- kernel/trace/trace_probe.c | 12 +++++------- kernel/trace/trace_probe.h | 19 ++++++++++++++++--- kernel/trace/trace_uprobe.c | 9 ++------- 4 files changed, 29 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 508396edc56a..6326c71181aa 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1136,8 +1136,6 @@ print_kprobe_event(struct trace_iterator *iter, int flags, struct kprobe_trace_entry_head *field; struct trace_seq *s = &iter->seq; struct trace_probe *tp; - u8 *data; - int i; field = (struct kprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); @@ -1149,11 +1147,9 @@ print_kprobe_event(struct trace_iterator *iter, int flags, trace_seq_putc(s, ')'); - data = (u8 *)&field[1]; - for (i = 0; i < tp->nr_args; i++) - if (!tp->args[i].type->print(s, tp->args[i].name, - data + tp->args[i].offset, field)) - goto out; + if (print_probe_args(s, tp->args, tp->nr_args, + (u8 *)&field[1], field) < 0) + goto out; trace_seq_putc(s, '\n'); out: @@ -1167,8 +1163,6 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, struct kretprobe_trace_entry_head *field; struct trace_seq *s = &iter->seq; struct trace_probe *tp; - u8 *data; - int i; field = (struct kretprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); @@ -1185,11 +1179,9 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, trace_seq_putc(s, ')'); - data = (u8 *)&field[1]; - for (i = 0; i < tp->nr_args; i++) - if (!tp->args[i].type->print(s, tp->args[i].name, - data + tp->args[i].offset, field)) - goto out; + if (print_probe_args(s, tp->args, tp->nr_args, + (u8 *)&field[1], field) < 0) + goto out; trace_seq_putc(s, '\n'); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index e99c3ce7aa65..e2c184eaa7db 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -26,10 +26,9 @@ const char *reserved_field_names[] = { /* Printing in basic type function template */ #define DEFINE_BASIC_PRINT_TYPE_FUNC(tname, type, fmt) \ -int PRINT_TYPE_FUNC_NAME(tname)(struct trace_seq *s, const char *name, \ - void *data, void *ent) \ +int PRINT_TYPE_FUNC_NAME(tname)(struct trace_seq *s, void *data, void *ent)\ { \ - trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ + trace_seq_printf(s, fmt, *(type *)data); \ return !trace_seq_has_overflowed(s); \ } \ const char PRINT_TYPE_FMT_NAME(tname)[] = fmt; \ @@ -49,15 +48,14 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(x32, u32, "0x%x") DEFINE_BASIC_PRINT_TYPE_FUNC(x64, u64, "0x%Lx") /* Print type function for string type */ -int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, - void *data, void *ent) +int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, void *data, void *ent) { int len = *(u32 *)data >> 16; if (!len) - trace_seq_printf(s, " %s=(fault)", name); + trace_seq_puts(s, "(fault)"); else - trace_seq_printf(s, " %s=\"%s\"", name, + trace_seq_printf(s, "\"%s\"", (const char *)get_loc_data(data, ent)); return !trace_seq_has_overflowed(s); } diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 03b10f3201a5..8254a061ac35 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -82,7 +82,7 @@ static nokprobe_inline void *get_loc_data(u32 *dl, void *ent) /* Data fetch function type */ typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); /* Printing function type */ -typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, void *); +typedef int (*print_type_func_t)(struct trace_seq *, void *, void *); /* Fetch types */ enum { @@ -124,8 +124,7 @@ typedef u32 string_size; /* Printing in basic type function template */ #define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \ -int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ - void *data, void *ent); \ +int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, void *data, void *ent);\ extern const char PRINT_TYPE_FMT_NAME(type)[] DECLARE_BASIC_PRINT_TYPE_FUNC(u8); @@ -403,6 +402,20 @@ store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs, } } +static inline int +print_probe_args(struct trace_seq *s, struct probe_arg *args, int nr_args, + u8 *data, void *field) +{ + int i; + + for (i = 0; i < nr_args; i++) { + trace_seq_printf(s, " %s=", args[i].name); + if (!args[i].type->print(s, data + args[i].offset, field)) + return -ENOMEM; + } + return 0; +} + extern int set_print_fmt(struct trace_probe *tp, bool is_return); #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d09638706fe0..c55753e1079e 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -892,7 +892,6 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e struct trace_seq *s = &iter->seq; struct trace_uprobe *tu; u8 *data; - int i; entry = (struct uprobe_trace_entry_head *)iter->ent; tu = container_of(event, struct trace_uprobe, tp.call.event); @@ -909,12 +908,8 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e data = DATAOF_TRACE_ENTRY(entry, false); } - for (i = 0; i < tu->tp.nr_args; i++) { - struct probe_arg *parg = &tu->tp.args[i]; - - if (!parg->type->print(s, parg->name, data + parg->offset, entry)) - goto out; - } + if (print_probe_args(s, tu->tp.args, tu->tp.nr_args, data, entry) < 0) + goto out; trace_seq_putc(s, '\n'); -- cgit v1.2.3 From eeb07b0615004bce145015b704de85fd3ac6cce0 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 25 Apr 2018 21:17:05 +0900 Subject: tracing: probeevent: Cleanup argument field definition Cleanup event argument definition code in one place for maintenancability. Link: http://lkml.kernel.org/r/152465862529.26224.9068605421476018902.stgit@devbox Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 32 ++++---------------------------- kernel/trace/trace_probe.c | 21 +++++++++++++++++++++ kernel/trace/trace_probe.h | 2 ++ kernel/trace/trace_uprobe.c | 15 ++------------- 4 files changed, 29 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 6326c71181aa..1356927e32d0 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1192,49 +1192,25 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, static int kprobe_event_define_fields(struct trace_event_call *event_call) { - int ret, i; + int ret; struct kprobe_trace_entry_head field; struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data; DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); - /* Set argument names as fields */ - for (i = 0; i < tk->tp.nr_args; i++) { - struct probe_arg *parg = &tk->tp.args[i]; - ret = trace_define_field(event_call, parg->type->fmttype, - parg->name, - sizeof(field) + parg->offset, - parg->type->size, - parg->type->is_signed, - FILTER_OTHER); - if (ret) - return ret; - } - return 0; + return traceprobe_define_arg_fields(event_call, sizeof(field), &tk->tp); } static int kretprobe_event_define_fields(struct trace_event_call *event_call) { - int ret, i; + int ret; struct kretprobe_trace_entry_head field; struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data; DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); - /* Set argument names as fields */ - for (i = 0; i < tk->tp.nr_args; i++) { - struct probe_arg *parg = &tk->tp.args[i]; - ret = trace_define_field(event_call, parg->type->fmttype, - parg->name, - sizeof(field) + parg->offset, - parg->type->size, - parg->type->is_signed, - FILTER_OTHER); - if (ret) - return ret; - } - return 0; + return traceprobe_define_arg_fields(event_call, sizeof(field), &tk->tp); } #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index e2c184eaa7db..21af28ffba3a 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -668,3 +668,24 @@ int set_print_fmt(struct trace_probe *tp, bool is_return) return 0; } + +int traceprobe_define_arg_fields(struct trace_event_call *event_call, + size_t offset, struct trace_probe *tp) +{ + int ret, i; + + /* Set argument names as fields */ + for (i = 0; i < tp->nr_args; i++) { + struct probe_arg *parg = &tp->args[i]; + + ret = trace_define_field(event_call, parg->type->fmttype, + parg->name, + offset + parg->offset, + parg->type->size, + parg->type->is_signed, + FILTER_OTHER); + if (ret) + return ret; + } + return 0; +} diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 8254a061ac35..a1df7763b797 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -429,3 +429,5 @@ create_local_trace_uprobe(char *name, unsigned long offs, unsigned long ref_ctr_offset, bool is_return); extern void destroy_local_trace_uprobe(struct trace_event_call *event_call); #endif +extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, + size_t offset, struct trace_probe *tp); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c55753e1079e..28a8f69cec89 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1018,7 +1018,7 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file) static int uprobe_event_define_fields(struct trace_event_call *event_call) { - int ret, i, size; + int ret, size; struct uprobe_trace_entry_head field; struct trace_uprobe *tu = event_call->data; @@ -1030,19 +1030,8 @@ static int uprobe_event_define_fields(struct trace_event_call *event_call) DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0); size = SIZEOF_TRACE_ENTRY(false); } - /* Set argument names as fields */ - for (i = 0; i < tu->tp.nr_args; i++) { - struct probe_arg *parg = &tu->tp.args[i]; - - ret = trace_define_field(event_call, parg->type->fmttype, - parg->name, size + parg->offset, - parg->type->size, parg->type->is_signed, - FILTER_OTHER); - if (ret) - return ret; - } - return 0; + return traceprobe_define_arg_fields(event_call, size, &tu->tp); } #ifdef CONFIG_PERF_EVENTS -- cgit v1.2.3 From 7bfbc63eda08b8158c040d6882c807f62b0750bb Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 25 Apr 2018 21:17:34 +0900 Subject: tracing: probeevent: Remove NOKPROBE_SYMBOL from print functions Remove unneeded NOKPROBE_SYMBOL from print functions since the print functions are only used when printing out the trace data, and not from kprobe handler. Link: http://lkml.kernel.org/r/152465865422.26224.10111548170594014954.stgit@devbox Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 21af28ffba3a..5f3b5b3fd2cd 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -31,8 +31,7 @@ int PRINT_TYPE_FUNC_NAME(tname)(struct trace_seq *s, void *data, void *ent)\ trace_seq_printf(s, fmt, *(type *)data); \ return !trace_seq_has_overflowed(s); \ } \ -const char PRINT_TYPE_FMT_NAME(tname)[] = fmt; \ -NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(tname)); +const char PRINT_TYPE_FMT_NAME(tname)[] = fmt; DEFINE_BASIC_PRINT_TYPE_FUNC(u8, u8, "%u") DEFINE_BASIC_PRINT_TYPE_FUNC(u16, u16, "%u") @@ -59,7 +58,6 @@ int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, void *data, void *ent) (const char *)get_loc_data(data, ent)); return !trace_seq_has_overflowed(s); } -NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string)); const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; -- cgit v1.2.3 From 533059281ee594f9fbb9e58042aaec77083ef251 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 25 Apr 2018 21:18:03 +0900 Subject: tracing: probeevent: Introduce new argument fetching code Replace {k,u}probe event argument fetching framework with switch-case based. Currently that is implemented with structures, macros and chain of function-pointers, which is more complicated than necessary and may get a performance penalty by retpoline. This simplify that with an array of "fetch_insn" (opcode and oprands), and make process_fetch_insn() just interprets it. No function pointers are used. Link: http://lkml.kernel.org/r/152465868340.26224.2551120475197839464.stgit@devbox Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 291 +++++++++++++---------------- kernel/trace/trace_probe.c | 401 +++++++++++----------------------------- kernel/trace/trace_probe.h | 230 +++++------------------ kernel/trace/trace_probe_tmpl.h | 120 ++++++++++++ kernel/trace/trace_uprobe.c | 127 ++++++++----- 5 files changed, 491 insertions(+), 678 deletions(-) create mode 100644 kernel/trace/trace_probe_tmpl.h (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1356927e32d0..c024cc40d509 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -14,6 +14,7 @@ #include "trace_kprobe_selftest.h" #include "trace_probe.h" +#include "trace_probe_tmpl.h" #define KPROBE_EVENT_SYSTEM "kprobes" #define KRETPROBE_MAXACTIVE_MAX 4096 @@ -120,160 +121,6 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); static int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs); -/* Memory fetching by symbol */ -struct symbol_cache { - char *symbol; - long offset; - unsigned long addr; -}; - -unsigned long update_symbol_cache(struct symbol_cache *sc) -{ - sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); - - if (sc->addr) - sc->addr += sc->offset; - - return sc->addr; -} - -void free_symbol_cache(struct symbol_cache *sc) -{ - kfree(sc->symbol); - kfree(sc); -} - -struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) -{ - struct symbol_cache *sc; - - if (!sym || strlen(sym) == 0) - return NULL; - - sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); - if (!sc) - return NULL; - - sc->symbol = kstrdup(sym, GFP_KERNEL); - if (!sc->symbol) { - kfree(sc); - return NULL; - } - sc->offset = offset; - update_symbol_cache(sc); - - return sc; -} - -/* - * Kprobes-specific fetch functions - */ -#define DEFINE_FETCH_stack(type) \ -static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \ - void *offset, void *dest) \ -{ \ - *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ - (unsigned int)((unsigned long)offset)); \ -} \ -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(stack, type)); - -DEFINE_BASIC_FETCH_FUNCS(stack) -/* No string on the stack entry */ -#define fetch_stack_string NULL -#define fetch_stack_string_size NULL - -#define DEFINE_FETCH_memory(type) \ -static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \ - void *addr, void *dest) \ -{ \ - type retval; \ - if (probe_kernel_address(addr, retval)) \ - *(type *)dest = 0; \ - else \ - *(type *)dest = retval; \ -} \ -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, type)); - -DEFINE_BASIC_FETCH_FUNCS(memory) -/* - * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max - * length and relative data location. - */ -static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, - void *addr, void *dest) -{ - int maxlen = get_rloc_len(*(u32 *)dest); - u8 *dst = get_rloc_data(dest); - long ret; - - if (!maxlen) - return; - - /* - * Try to get string again, since the string can be changed while - * probing. - */ - ret = strncpy_from_unsafe(dst, addr, maxlen); - - if (ret < 0) { /* Failed to fetch string */ - dst[0] = '\0'; - *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); - } else { - *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest)); - } -} -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string)); - -/* Return the length of string -- including null terminal byte */ -static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, - void *addr, void *dest) -{ - mm_segment_t old_fs; - int ret, len = 0; - u8 c; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - pagefault_disable(); - - do { - ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); - len++; - } while (c && ret == 0 && len < MAX_STRING_SIZE); - - pagefault_enable(); - set_fs(old_fs); - - if (ret < 0) /* Failed to check the length */ - *(u32 *)dest = 0; - else - *(u32 *)dest = len; -} -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string_size)); - -#define DEFINE_FETCH_symbol(type) \ -void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, void *data, void *dest)\ -{ \ - struct symbol_cache *sc = data; \ - if (sc->addr) \ - fetch_memory_##type(regs, (void *)sc->addr, dest); \ - else \ - *(type *)dest = 0; \ -} \ -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(symbol, type)); - -DEFINE_BASIC_FETCH_FUNCS(symbol) -DEFINE_FETCH_symbol(string) -DEFINE_FETCH_symbol(string_size) - -/* kprobes don't support file_offset fetch methods */ -#define fetch_file_offset_u8 NULL -#define fetch_file_offset_u16 NULL -#define fetch_file_offset_u32 NULL -#define fetch_file_offset_u64 NULL -#define fetch_file_offset_string NULL -#define fetch_file_offset_string_size NULL - /* Fetch type information table */ static const struct fetch_type kprobes_fetch_type_table[] = { /* Special types */ @@ -529,7 +376,7 @@ static bool within_notrace_func(struct trace_kprobe *tk) /* Internal register function - just handle k*probes and flags */ static int __register_trace_kprobe(struct trace_kprobe *tk) { - int i, ret; + int ret; if (trace_probe_is_registered(&tk->tp)) return -EINVAL; @@ -540,9 +387,6 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) return -EINVAL; } - for (i = 0; i < tk->tp.nr_args; i++) - traceprobe_update_arg(&tk->tp.args[i]); - /* Set/clear disabled flag according to tp->flag */ if (trace_probe_is_enabled(&tk->tp)) tk->rp.kp.flags &= ~KPROBE_FLAG_DISABLED; @@ -876,8 +720,8 @@ static int create_trace_kprobe(int argc, char **argv) /* Parse fetch argument */ ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, - is_return, true, - kprobes_fetch_type_table); + is_return, true, + kprobes_fetch_type_table); if (ret) { pr_info("Parse error at argument[%d]. (%d)\n", i, ret); goto error; @@ -1031,6 +875,133 @@ static const struct file_operations kprobe_profile_ops = { .release = seq_release, }; +/* Kprobe specific fetch functions */ + +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline void +fetch_store_strlen(unsigned long addr, void *dest) +{ + mm_segment_t old_fs; + int ret, len = 0; + u8 c; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + pagefault_disable(); + + do { + ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); + len++; + } while (c && ret == 0 && len < MAX_STRING_SIZE); + + pagefault_enable(); + set_fs(old_fs); + + if (ret < 0) /* Failed to check the length */ + *(u32 *)dest = 0; + else + *(u32 *)dest = len; +} + +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max + * length and relative data location. + */ +static nokprobe_inline void +fetch_store_string(unsigned long addr, void *dest) +{ + int maxlen = get_rloc_len(*(u32 *)dest); + u8 *dst = get_rloc_data(dest); + long ret; + + if (!maxlen) + return; + + /* + * Try to get string again, since the string can be changed while + * probing. + */ + ret = strncpy_from_unsafe(dst, (void *)addr, maxlen); + + if (ret < 0) { /* Failed to fetch string */ + dst[0] = '\0'; + *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); + } else { + *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest)); + } +} + +/* Note that we don't verify it, since the code does not come from user space */ +static int +process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, + bool pre) +{ + unsigned long val; + int ret; + + /* 1st stage: get value from context */ + switch (code->op) { + case FETCH_OP_REG: + val = regs_get_register(regs, code->param); + break; + case FETCH_OP_STACK: + val = regs_get_kernel_stack_nth(regs, code->param); + break; + case FETCH_OP_STACKP: + val = kernel_stack_pointer(regs); + break; + case FETCH_OP_RETVAL: + val = regs_return_value(regs); + break; + case FETCH_OP_IMM: + val = code->immediate; + break; + case FETCH_OP_COMM: + val = (unsigned long)current->comm; + break; + default: + return -EILSEQ; + } + code++; + + /* 2nd stage: dereference memory if needed */ + while (code->op == FETCH_OP_DEREF) { + ret = probe_kernel_read(&val, (void *)val + code->offset, + sizeof(val)); + if (ret) + return ret; + code++; + } + + /* 3rd stage: store value to buffer */ + switch (code->op) { + case FETCH_OP_ST_RAW: + fetch_store_raw(val, code, dest); + break; + case FETCH_OP_ST_MEM: + probe_kernel_read(dest, (void *)val + code->offset, code->size); + break; + case FETCH_OP_ST_STRING: + if (pre) + fetch_store_strlen(val + code->offset, dest); + else + fetch_store_string(val + code->offset, dest); + break; + default: + return -EILSEQ; + } + code++; + + /* 4th stage: modify stored value if needed */ + if (code->op == FETCH_OP_MOD_BF) { + fetch_apply_bitfield(code, dest); + code++; + } + + return code->op == FETCH_OP_END ? 0 : -EILSEQ; +} +NOKPROBE_SYMBOL(process_fetch_insn) + /* Kprobe handler */ static nokprobe_inline void __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 5f3b5b3fd2cd..c59c69cb2f2e 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -61,174 +61,6 @@ int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, void *data, void *ent) const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; -#define CHECK_FETCH_FUNCS(method, fn) \ - (((FETCH_FUNC_NAME(method, u8) == fn) || \ - (FETCH_FUNC_NAME(method, u16) == fn) || \ - (FETCH_FUNC_NAME(method, u32) == fn) || \ - (FETCH_FUNC_NAME(method, u64) == fn) || \ - (FETCH_FUNC_NAME(method, string) == fn) || \ - (FETCH_FUNC_NAME(method, string_size) == fn)) \ - && (fn != NULL)) - -/* Data fetch function templates */ -#define DEFINE_FETCH_reg(type) \ -void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, void *offset, void *dest) \ -{ \ - *(type *)dest = (type)regs_get_register(regs, \ - (unsigned int)((unsigned long)offset)); \ -} \ -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(reg, type)); -DEFINE_BASIC_FETCH_FUNCS(reg) -/* No string on the register */ -#define fetch_reg_string NULL -#define fetch_reg_string_size NULL - -#define DEFINE_FETCH_retval(type) \ -void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \ - void *dummy, void *dest) \ -{ \ - *(type *)dest = (type)regs_return_value(regs); \ -} \ -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(retval, type)); -DEFINE_BASIC_FETCH_FUNCS(retval) -/* No string on the retval */ -#define fetch_retval_string NULL -#define fetch_retval_string_size NULL - -/* Dereference memory access function */ -struct deref_fetch_param { - struct fetch_param orig; - long offset; - fetch_func_t fetch; - fetch_func_t fetch_size; -}; - -#define DEFINE_FETCH_deref(type) \ -void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \ - void *data, void *dest) \ -{ \ - struct deref_fetch_param *dprm = data; \ - unsigned long addr; \ - call_fetch(&dprm->orig, regs, &addr); \ - if (addr) { \ - addr += dprm->offset; \ - dprm->fetch(regs, (void *)addr, dest); \ - } else \ - *(type *)dest = 0; \ -} \ -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, type)); -DEFINE_BASIC_FETCH_FUNCS(deref) -DEFINE_FETCH_deref(string) - -void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs, - void *data, void *dest) -{ - struct deref_fetch_param *dprm = data; - unsigned long addr; - - call_fetch(&dprm->orig, regs, &addr); - if (addr && dprm->fetch_size) { - addr += dprm->offset; - dprm->fetch_size(regs, (void *)addr, dest); - } else - *(string_size *)dest = 0; -} -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, string_size)); - -static void update_deref_fetch_param(struct deref_fetch_param *data) -{ - if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) - update_deref_fetch_param(data->orig.data); - else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) - update_symbol_cache(data->orig.data); -} -NOKPROBE_SYMBOL(update_deref_fetch_param); - -static void free_deref_fetch_param(struct deref_fetch_param *data) -{ - if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) - free_deref_fetch_param(data->orig.data); - else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) - free_symbol_cache(data->orig.data); - kfree(data); -} -NOKPROBE_SYMBOL(free_deref_fetch_param); - -/* Bitfield fetch function */ -struct bitfield_fetch_param { - struct fetch_param orig; - unsigned char hi_shift; - unsigned char low_shift; -}; - -#define DEFINE_FETCH_bitfield(type) \ -void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \ - void *data, void *dest) \ -{ \ - struct bitfield_fetch_param *bprm = data; \ - type buf = 0; \ - call_fetch(&bprm->orig, regs, &buf); \ - if (buf) { \ - buf <<= bprm->hi_shift; \ - buf >>= bprm->low_shift; \ - } \ - *(type *)dest = buf; \ -} \ -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(bitfield, type)); -DEFINE_BASIC_FETCH_FUNCS(bitfield) -#define fetch_bitfield_string NULL -#define fetch_bitfield_string_size NULL - -static void -update_bitfield_fetch_param(struct bitfield_fetch_param *data) -{ - /* - * Don't check the bitfield itself, because this must be the - * last fetch function. - */ - if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) - update_deref_fetch_param(data->orig.data); - else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) - update_symbol_cache(data->orig.data); -} - -static void -free_bitfield_fetch_param(struct bitfield_fetch_param *data) -{ - /* - * Don't check the bitfield itself, because this must be the - * last fetch function. - */ - if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) - free_deref_fetch_param(data->orig.data); - else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) - free_symbol_cache(data->orig.data); - - kfree(data); -} - -void FETCH_FUNC_NAME(comm, string)(struct pt_regs *regs, - void *data, void *dest) -{ - int maxlen = get_rloc_len(*(u32 *)dest); - u8 *dst = get_rloc_data(dest); - long ret; - - if (!maxlen) - return; - - ret = strlcpy(dst, current->comm, maxlen); - *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest)); -} -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string)); - -void FETCH_FUNC_NAME(comm, string_size)(struct pt_regs *regs, - void *data, void *dest) -{ - *(u32 *)dest = strlen(current->comm) + 1; -} -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string_size)); - static const struct fetch_type *find_fetch_type(const char *type, const struct fetch_type *ftbl) { @@ -272,37 +104,6 @@ fail: return NULL; } -/* Special function : only accept unsigned long */ -static void fetch_kernel_stack_address(struct pt_regs *regs, void *dummy, void *dest) -{ - *(unsigned long *)dest = kernel_stack_pointer(regs); -} -NOKPROBE_SYMBOL(fetch_kernel_stack_address); - -static void fetch_user_stack_address(struct pt_regs *regs, void *dummy, void *dest) -{ - *(unsigned long *)dest = user_stack_pointer(regs); -} -NOKPROBE_SYMBOL(fetch_user_stack_address); - -static fetch_func_t get_fetch_size_function(const struct fetch_type *type, - fetch_func_t orig_fn, - const struct fetch_type *ftbl) -{ - int i; - - if (type != &ftbl[FETCH_TYPE_STRING]) - return NULL; /* Only string type needs size function */ - - for (i = 0; i < FETCH_MTD_END; i++) - if (type->fetch[i] == orig_fn) - return ftbl[FETCH_TYPE_STRSIZE].fetch[i]; - - WARN_ON(1); /* This should not happen */ - - return NULL; -} - /* Split symbol and offset. */ int traceprobe_split_symbol_offset(char *symbol, long *offset) { @@ -327,7 +128,7 @@ int traceprobe_split_symbol_offset(char *symbol, long *offset) #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) static int parse_probe_vars(char *arg, const struct fetch_type *t, - struct fetch_param *f, bool is_return, + struct fetch_insn *code, bool is_return, bool is_kprobe) { int ret = 0; @@ -335,33 +136,24 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, if (strcmp(arg, "retval") == 0) { if (is_return) - f->fn = t->fetch[FETCH_MTD_retval]; + code->op = FETCH_OP_RETVAL; else ret = -EINVAL; } else if (strncmp(arg, "stack", 5) == 0) { if (arg[5] == '\0') { - if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR)) - return -EINVAL; - - if (is_kprobe) - f->fn = fetch_kernel_stack_address; - else - f->fn = fetch_user_stack_address; + code->op = FETCH_OP_STACKP; } else if (isdigit(arg[5])) { ret = kstrtoul(arg + 5, 10, ¶m); if (ret || (is_kprobe && param > PARAM_MAX_STACK)) ret = -EINVAL; else { - f->fn = t->fetch[FETCH_MTD_stack]; - f->data = (void *)param; + code->op = FETCH_OP_STACK; + code->param = (unsigned int)param; } } else ret = -EINVAL; } else if (strcmp(arg, "comm") == 0) { - if (strcmp(t->name, "string") != 0 && - strcmp(t->name, "string_size") != 0) - return -EINVAL; - f->fn = t->fetch[FETCH_MTD_comm]; + code->op = FETCH_OP_COMM; } else ret = -EINVAL; @@ -369,10 +161,13 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, } /* Recursive argument parser */ -static int parse_probe_arg(char *arg, const struct fetch_type *t, - struct fetch_param *f, bool is_return, bool is_kprobe, - const struct fetch_type *ftbl) +static int +parse_probe_arg(char *arg, const struct fetch_type *type, + struct fetch_insn **pcode, struct fetch_insn *end, + bool is_return, bool is_kprobe, + const struct fetch_type *ftbl) { + struct fetch_insn *code = *pcode; unsigned long param; long offset; char *tmp; @@ -380,14 +175,15 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, switch (arg[0]) { case '$': - ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe); + ret = parse_probe_vars(arg + 1, type, code, + is_return, is_kprobe); break; case '%': /* named register */ ret = regs_query_register_offset(arg + 1); if (ret >= 0) { - f->fn = t->fetch[FETCH_MTD_reg]; - f->data = (void *)(unsigned long)ret; + code->op = FETCH_OP_REG; + code->param = (unsigned int)ret; ret = 0; } break; @@ -397,9 +193,9 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, ret = kstrtoul(arg + 1, 0, ¶m); if (ret) break; - - f->fn = t->fetch[FETCH_MTD_memory]; - f->data = (void *)param; + /* load address */ + code->op = FETCH_OP_IMM; + code->immediate = param; } else if (arg[1] == '+') { /* kprobes don't support file offsets */ if (is_kprobe) @@ -409,8 +205,8 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, if (ret) break; - f->fn = t->fetch[FETCH_MTD_file_offset]; - f->data = (void *)offset; + code->op = FETCH_OP_FOFFS; + code->immediate = (unsigned long)offset; // imm64? } else { /* uprobes don't support symbols */ if (!is_kprobe) @@ -420,10 +216,19 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, if (ret) break; - f->data = alloc_symbol_cache(arg + 1, offset); - if (f->data) - f->fn = t->fetch[FETCH_MTD_symbol]; + code->op = FETCH_OP_IMM; + code->immediate = + (unsigned long)kallsyms_lookup_name(arg + 1); + if (!code->immediate) + return -ENOENT; + code->immediate += offset; } + /* These are fetching from memory */ + if (++code == end) + return -E2BIG; + *pcode = code; + code->op = FETCH_OP_DEREF; + code->offset = offset; break; case '+': /* deref memory */ @@ -431,11 +236,10 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, case '-': tmp = strchr(arg, '('); if (!tmp) - break; + return -EINVAL; *tmp = '\0'; ret = kstrtol(arg, 0, &offset); - if (ret) break; @@ -443,36 +247,29 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, tmp = strrchr(arg, ')'); if (tmp) { - struct deref_fetch_param *dprm; - const struct fetch_type *t2; + const struct fetch_type *t2; t2 = find_fetch_type(NULL, ftbl); *tmp = '\0'; - dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL); - - if (!dprm) - return -ENOMEM; - - dprm->offset = offset; - dprm->fetch = t->fetch[FETCH_MTD_memory]; - dprm->fetch_size = get_fetch_size_function(t, - dprm->fetch, ftbl); - ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, - is_kprobe, ftbl); + ret = parse_probe_arg(arg, t2, &code, end, is_return, + is_kprobe, ftbl); if (ret) - kfree(dprm); - else { - f->fn = t->fetch[FETCH_MTD_deref]; - f->data = (void *)dprm; - } + break; + if (code->op == FETCH_OP_COMM) + return -EINVAL; + if (++code == end) + return -E2BIG; + *pcode = code; + + code->op = FETCH_OP_DEREF; + code->offset = offset; } break; } - if (!ret && !f->fn) { /* Parsed, but do not find fetch method */ - pr_info("%s type has no corresponding fetch method.\n", t->name); + if (!ret && code->op == FETCH_OP_NOP) { + /* Parsed, but do not find fetch method */ ret = -EINVAL; } - return ret; } @@ -481,22 +278,15 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, /* Bitfield type needs to be parsed into a fetch function */ static int __parse_bitfield_probe_arg(const char *bf, const struct fetch_type *t, - struct fetch_param *f) + struct fetch_insn **pcode) { - struct bitfield_fetch_param *bprm; + struct fetch_insn *code = *pcode; unsigned long bw, bo; char *tail; if (*bf != 'b') return 0; - bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); - if (!bprm) - return -ENOMEM; - - bprm->orig = *f; - f->fn = t->fetch[FETCH_MTD_bitfield]; - f->data = (void *)bprm; bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */ if (bw == 0 || *tail != '@') @@ -507,9 +297,15 @@ static int __parse_bitfield_probe_arg(const char *bf, if (tail == bf || *tail != '/') return -EINVAL; + code++; + if (code->op != FETCH_OP_NOP) + return -E2BIG; + *pcode = code; - bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo); - bprm->low_shift = bprm->hi_shift + bo; + code->op = FETCH_OP_MOD_BF; + code->lshift = BYTES_TO_BITS(t->size) - (bw + bo); + code->rshift = BYTES_TO_BITS(t->size) - bw; + code->basesize = t->size; return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0; } @@ -519,6 +315,7 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, struct probe_arg *parg, bool is_return, bool is_kprobe, const struct fetch_type *ftbl) { + struct fetch_insn *code, *tmp = NULL; const char *t; int ret; @@ -549,18 +346,60 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, } parg->offset = *size; *size += parg->type->size; - ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, - is_kprobe, ftbl); - - if (ret >= 0 && t != NULL) - ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); - if (ret >= 0) { - parg->fetch_size.fn = get_fetch_size_function(parg->type, - parg->fetch.fn, - ftbl); - parg->fetch_size.data = parg->fetch.data; + code = tmp = kzalloc(sizeof(*code) * FETCH_INSN_MAX, GFP_KERNEL); + if (!code) + return -ENOMEM; + code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; + + ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], + is_return, is_kprobe, ftbl); + if (ret) + goto fail; + + /* Store operation */ + if (!strcmp(parg->type->name, "string")) { + if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_IMM && + code->op != FETCH_OP_COMM) { + pr_info("string only accepts memory or address.\n"); + ret = -EINVAL; + goto fail; + } + /* Since IMM or COMM must be the 1st insn, this is safe */ + if (code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM) + code++; + code->op = FETCH_OP_ST_STRING; /* In DEREF case, replace it */ + parg->dynamic = true; + } else if (code->op == FETCH_OP_DEREF) { + code->op = FETCH_OP_ST_MEM; + code->size = parg->type->size; + } else { + code++; + if (code->op != FETCH_OP_NOP) { + ret = -E2BIG; + goto fail; + } + code->op = FETCH_OP_ST_RAW; + code->size = parg->type->size; + } + /* Modify operation */ + if (t != NULL) { + ret = __parse_bitfield_probe_arg(t, parg->type, &code); + if (ret) + goto fail; } + code++; + code->op = FETCH_OP_END; + + /* Shrink down the code buffer */ + parg->code = kzalloc(sizeof(*code) * (code - tmp + 1), GFP_KERNEL); + if (!parg->code) + ret = -ENOMEM; + else + memcpy(parg->code, tmp, sizeof(*code) * (code - tmp + 1)); + +fail: + kfree(tmp); return ret; } @@ -582,25 +421,9 @@ int traceprobe_conflict_field_name(const char *name, return 0; } -void traceprobe_update_arg(struct probe_arg *arg) -{ - if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) - update_bitfield_fetch_param(arg->fetch.data); - else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) - update_deref_fetch_param(arg->fetch.data); - else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) - update_symbol_cache(arg->fetch.data); -} - void traceprobe_free_probe_arg(struct probe_arg *arg) { - if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) - free_bitfield_fetch_param(arg->fetch.data); - else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) - free_deref_fetch_param(arg->fetch.data); - else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) - free_symbol_cache(arg->fetch.data); - + kfree(arg->code); kfree(arg->name); kfree(arg->comm); } diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index a1df7763b797..42c724a7ad11 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -79,25 +79,50 @@ static nokprobe_inline void *get_loc_data(u32 *dl, void *ent) return (u8 *)ent + get_rloc_offs(*dl); } -/* Data fetch function type */ -typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); /* Printing function type */ typedef int (*print_type_func_t)(struct trace_seq *, void *, void *); -/* Fetch types */ -enum { - FETCH_MTD_reg = 0, - FETCH_MTD_stack, - FETCH_MTD_retval, - FETCH_MTD_comm, - FETCH_MTD_memory, - FETCH_MTD_symbol, - FETCH_MTD_deref, - FETCH_MTD_bitfield, - FETCH_MTD_file_offset, - FETCH_MTD_END, +enum fetch_op { + FETCH_OP_NOP = 0, + // Stage 1 (load) ops + FETCH_OP_REG, /* Register : .param = offset */ + FETCH_OP_STACK, /* Stack : .param = index */ + FETCH_OP_STACKP, /* Stack pointer */ + FETCH_OP_RETVAL, /* Return value */ + FETCH_OP_IMM, /* Immediate : .immediate */ + FETCH_OP_COMM, /* Current comm */ + FETCH_OP_FOFFS, /* File offset: .immediate */ + // Stage 2 (dereference) op + FETCH_OP_DEREF, /* Dereference: .offset */ + // Stage 3 (store) ops + FETCH_OP_ST_RAW, /* Raw: .size */ + FETCH_OP_ST_MEM, /* Mem: .offset, .size */ + FETCH_OP_ST_STRING, /* String: .offset, .size */ + // Stage 4 (modify) op + FETCH_OP_MOD_BF, /* Bitfield: .basesize, .lshift, .rshift */ + FETCH_OP_END, }; +struct fetch_insn { + enum fetch_op op; + union { + unsigned int param; + struct { + unsigned int size; + int offset; + }; + struct { + unsigned char basesize; + unsigned char lshift; + unsigned char rshift; + }; + unsigned long immediate; + }; +}; + +/* fetch + deref*N + store + mod + end <= 16, this allows N=12, enough */ +#define FETCH_INSN_MAX 16 + /* Fetch type information table */ struct fetch_type { const char *name; /* Name of type */ @@ -106,13 +131,6 @@ struct fetch_type { print_type_func_t print; /* Print functions */ const char *fmt; /* Fromat string */ const char *fmttype; /* Name in format file */ - /* Fetch functions */ - fetch_func_t fetch[FETCH_MTD_END]; -}; - -struct fetch_param { - fetch_func_t fn; - void *data; }; /* For defining macros, define string/string_size types */ @@ -142,66 +160,12 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(x64); DECLARE_BASIC_PRINT_TYPE_FUNC(string); -#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type - -/* Declare macro for basic types */ -#define DECLARE_FETCH_FUNC(method, type) \ -extern void FETCH_FUNC_NAME(method, type)(struct pt_regs *regs, \ - void *data, void *dest) - -#define DECLARE_BASIC_FETCH_FUNCS(method) \ -DECLARE_FETCH_FUNC(method, u8); \ -DECLARE_FETCH_FUNC(method, u16); \ -DECLARE_FETCH_FUNC(method, u32); \ -DECLARE_FETCH_FUNC(method, u64) - -DECLARE_BASIC_FETCH_FUNCS(reg); -#define fetch_reg_string NULL -#define fetch_reg_string_size NULL - -DECLARE_BASIC_FETCH_FUNCS(retval); -#define fetch_retval_string NULL -#define fetch_retval_string_size NULL - -DECLARE_BASIC_FETCH_FUNCS(symbol); -DECLARE_FETCH_FUNC(symbol, string); -DECLARE_FETCH_FUNC(symbol, string_size); - -DECLARE_BASIC_FETCH_FUNCS(deref); -DECLARE_FETCH_FUNC(deref, string); -DECLARE_FETCH_FUNC(deref, string_size); - -DECLARE_BASIC_FETCH_FUNCS(bitfield); -#define fetch_bitfield_string NULL -#define fetch_bitfield_string_size NULL - -/* comm only makes sense as a string */ -#define fetch_comm_u8 NULL -#define fetch_comm_u16 NULL -#define fetch_comm_u32 NULL -#define fetch_comm_u64 NULL -DECLARE_FETCH_FUNC(comm, string); -DECLARE_FETCH_FUNC(comm, string_size); - -/* - * Define macro for basic types - we don't need to define s* types, because - * we have to care only about bitwidth at recording time. - */ -#define DEFINE_BASIC_FETCH_FUNCS(method) \ -DEFINE_FETCH_##method(u8) \ -DEFINE_FETCH_##method(u16) \ -DEFINE_FETCH_##method(u32) \ -DEFINE_FETCH_##method(u64) - /* Default (unsigned long) fetch type */ #define __DEFAULT_FETCH_TYPE(t) x##t #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) -#define ASSIGN_FETCH_FUNC(method, type) \ - [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) - #define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ {.name = _name, \ .size = _size, \ @@ -209,17 +173,6 @@ DEFINE_FETCH_##method(u64) .print = PRINT_TYPE_FUNC_NAME(ptype), \ .fmt = PRINT_TYPE_FMT_NAME(ptype), \ .fmttype = _fmttype, \ - .fetch = { \ -ASSIGN_FETCH_FUNC(reg, ftype), \ -ASSIGN_FETCH_FUNC(stack, ftype), \ -ASSIGN_FETCH_FUNC(retval, ftype), \ -ASSIGN_FETCH_FUNC(comm, ftype), \ -ASSIGN_FETCH_FUNC(memory, ftype), \ -ASSIGN_FETCH_FUNC(symbol, ftype), \ -ASSIGN_FETCH_FUNC(deref, ftype), \ -ASSIGN_FETCH_FUNC(bitfield, ftype), \ -ASSIGN_FETCH_FUNC(file_offset, ftype), \ - } \ } #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ @@ -231,42 +184,13 @@ ASSIGN_FETCH_FUNC(file_offset, ftype), \ #define ASSIGN_FETCH_TYPE_END {} -#define FETCH_TYPE_STRING 0 -#define FETCH_TYPE_STRSIZE 1 +#define FETCH_TYPE_STRING 0 +#define FETCH_TYPE_STRSIZE 1 #ifdef CONFIG_KPROBE_EVENTS -struct symbol_cache; -unsigned long update_symbol_cache(struct symbol_cache *sc); -void free_symbol_cache(struct symbol_cache *sc); -struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); bool trace_kprobe_on_func_entry(struct trace_event_call *call); bool trace_kprobe_error_injectable(struct trace_event_call *call); #else -/* uprobes do not support symbol fetch methods */ -#define fetch_symbol_u8 NULL -#define fetch_symbol_u16 NULL -#define fetch_symbol_u32 NULL -#define fetch_symbol_u64 NULL -#define fetch_symbol_string NULL -#define fetch_symbol_string_size NULL - -struct symbol_cache { -}; -static inline unsigned long __used update_symbol_cache(struct symbol_cache *sc) -{ - return 0; -} - -static inline void __used free_symbol_cache(struct symbol_cache *sc) -{ -} - -static inline struct symbol_cache * __used -alloc_symbol_cache(const char *sym, long offset) -{ - return NULL; -} - static inline bool trace_kprobe_on_func_entry(struct trace_event_call *call) { return false; @@ -279,8 +203,8 @@ static inline bool trace_kprobe_error_injectable(struct trace_event_call *call) #endif /* CONFIG_KPROBE_EVENTS */ struct probe_arg { - struct fetch_param fetch; - struct fetch_param fetch_size; + struct fetch_insn *code; + bool dynamic;/* Dynamic array (string) is used */ unsigned int offset; /* Offset from argument entry */ const char *name; /* Name of this argument */ const char *comm; /* Command of this argument */ @@ -312,12 +236,6 @@ static inline bool trace_probe_is_registered(struct trace_probe *tp) return !!(tp->flags & TP_FLAG_REGISTERED); } -static nokprobe_inline void call_fetch(struct fetch_param *fprm, - struct pt_regs *regs, void *dest) -{ - return fprm->fn(regs, fprm->data, dest); -} - /* Check the name is good for event/group/fields */ static inline bool is_good_name(const char *name) { @@ -354,68 +272,6 @@ extern void traceprobe_free_probe_arg(struct probe_arg *arg); extern int traceprobe_split_symbol_offset(char *symbol, long *offset); -/* Sum up total data length for dynamic arraies (strings) */ -static nokprobe_inline int -__get_data_size(struct trace_probe *tp, struct pt_regs *regs) -{ - int i, ret = 0; - u32 len; - - for (i = 0; i < tp->nr_args; i++) - if (unlikely(tp->args[i].fetch_size.fn)) { - call_fetch(&tp->args[i].fetch_size, regs, &len); - ret += len; - } - - return ret; -} - -/* Store the value of each argument */ -static nokprobe_inline void -store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs, - u8 *data, int maxlen) -{ - int i; - u32 end = tp->size; - u32 *dl; /* Data (relative) location */ - - for (i = 0; i < tp->nr_args; i++) { - if (unlikely(tp->args[i].fetch_size.fn)) { - /* - * First, we set the relative location and - * maximum data length to *dl - */ - dl = (u32 *)(data + tp->args[i].offset); - *dl = make_data_rloc(maxlen, end - tp->args[i].offset); - /* Then try to fetch string or dynamic array data */ - call_fetch(&tp->args[i].fetch, regs, dl); - /* Reduce maximum length */ - end += get_rloc_len(*dl); - maxlen -= get_rloc_len(*dl); - /* Trick here, convert data_rloc to data_loc */ - *dl = convert_rloc_to_loc(*dl, - ent_size + tp->args[i].offset); - } else - /* Just fetching data normally */ - call_fetch(&tp->args[i].fetch, regs, - data + tp->args[i].offset); - } -} - -static inline int -print_probe_args(struct trace_seq *s, struct probe_arg *args, int nr_args, - u8 *data, void *field) -{ - int i; - - for (i = 0; i < nr_args; i++) { - trace_seq_printf(s, " %s=", args[i].name); - if (!args[i].type->print(s, data + args[i].offset, field)) - return -ENOMEM; - } - return 0; -} - extern int set_print_fmt(struct trace_probe *tp, bool is_return); #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h new file mode 100644 index 000000000000..c8a5272abf01 --- /dev/null +++ b/kernel/trace/trace_probe_tmpl.h @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Traceprobe fetch helper inlines + */ + +static nokprobe_inline void +fetch_store_raw(unsigned long val, struct fetch_insn *code, void *buf) +{ + switch (code->size) { + case 1: + *(u8 *)buf = (u8)val; + break; + case 2: + *(u16 *)buf = (u16)val; + break; + case 4: + *(u32 *)buf = (u32)val; + break; + case 8: + //TBD: 32bit signed + *(u64 *)buf = (u64)val; + break; + default: + *(unsigned long *)buf = val; + } +} + +static nokprobe_inline void +fetch_apply_bitfield(struct fetch_insn *code, void *buf) +{ + switch (code->basesize) { + case 1: + *(u8 *)buf <<= code->lshift; + *(u8 *)buf >>= code->rshift; + break; + case 2: + *(u16 *)buf <<= code->lshift; + *(u16 *)buf >>= code->rshift; + break; + case 4: + *(u32 *)buf <<= code->lshift; + *(u32 *)buf >>= code->rshift; + break; + case 8: + *(u64 *)buf <<= code->lshift; + *(u64 *)buf >>= code->rshift; + break; + } +} + +/* Define this for each callsite */ +static int +process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, + void *dest, bool pre); + +/* Sum up total data length for dynamic arraies (strings) */ +static nokprobe_inline int +__get_data_size(struct trace_probe *tp, struct pt_regs *regs) +{ + struct probe_arg *arg; + int i, ret = 0; + u32 len; + + for (i = 0; i < tp->nr_args; i++) { + arg = tp->args + i; + if (unlikely(arg->dynamic)) { + process_fetch_insn(arg->code, regs, &len, true); + ret += len; + } + } + + return ret; +} + +/* Store the value of each argument */ +static nokprobe_inline void +store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs, + u8 *data, int maxlen) +{ + struct probe_arg *arg; + u32 end = tp->size; + u32 *dl; /* Data (relative) location */ + int i; + + for (i = 0; i < tp->nr_args; i++) { + arg = tp->args + i; + if (unlikely(arg->dynamic)) { + /* + * First, we set the relative location and + * maximum data length to *dl + */ + dl = (u32 *)(data + arg->offset); + *dl = make_data_rloc(maxlen, end - arg->offset); + /* Then try to fetch string or dynamic array data */ + process_fetch_insn(arg->code, regs, dl, false); + /* Reduce maximum length */ + end += get_rloc_len(*dl); + maxlen -= get_rloc_len(*dl); + /* Trick here, convert data_rloc to data_loc */ + *dl = convert_rloc_to_loc(*dl, ent_size + arg->offset); + } else + /* Just fetching data normally */ + process_fetch_insn(arg->code, regs, data + arg->offset, + false); + } +} + +static inline int +print_probe_args(struct trace_seq *s, struct probe_arg *args, int nr_args, + u8 *data, void *field) +{ + int i; + + for (i = 0; i < nr_args; i++) { + trace_seq_printf(s, " %s=", args[i].name); + if (!args[i].type->print(s, data + args[i].offset, field)) + return -ENOMEM; + } + return 0; +} diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 28a8f69cec89..e076f89ab33a 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -15,6 +15,7 @@ #include #include "trace_probe.h" +#include "trace_probe_tmpl.h" #define UPROBE_EVENT_SYSTEM "uprobes" @@ -99,37 +100,19 @@ static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n) /* * Uprobes-specific fetch functions */ -#define DEFINE_FETCH_stack(type) \ -static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \ - void *offset, void *dest) \ -{ \ - *(type *)dest = (type)get_user_stack_nth(regs, \ - ((unsigned long)offset)); \ -} -DEFINE_BASIC_FETCH_FUNCS(stack) -/* No string on the stack entry */ -#define fetch_stack_string NULL -#define fetch_stack_string_size NULL - -#define DEFINE_FETCH_memory(type) \ -static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \ - void *addr, void *dest) \ -{ \ - type retval; \ - void __user *vaddr = (void __force __user *) addr; \ - \ - if (copy_from_user(&retval, vaddr, sizeof(type))) \ - *(type *)dest = 0; \ - else \ - *(type *) dest = retval; \ +static nokprobe_inline int +probe_user_read(void *dest, void *src, size_t size) +{ + void __user *vaddr = (void __force __user *)src; + + return copy_from_user(dest, vaddr, size); } -DEFINE_BASIC_FETCH_FUNCS(memory) /* * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max * length and relative data location. */ -static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, - void *addr, void *dest) +static nokprobe_inline void +fetch_store_string(unsigned long addr, void *dest) { long ret; u32 rloc = *(u32 *)dest; @@ -152,8 +135,9 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, } } -static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, - void *addr, void *dest) +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline void +fetch_store_strlen(unsigned long addr, void *dest) { int len; void __user *vaddr = (void __force __user *) addr; @@ -166,7 +150,7 @@ static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, *(u32 *)dest = len; } -static unsigned long translate_user_vaddr(void *file_offset) +static unsigned long translate_user_vaddr(unsigned long file_offset) { unsigned long base_addr; struct uprobe_dispatch_data *udd; @@ -174,21 +158,9 @@ static unsigned long translate_user_vaddr(void *file_offset) udd = (void *) current->utask->vaddr; base_addr = udd->bp_addr - udd->tu->offset; - return base_addr + (unsigned long)file_offset; + return base_addr + file_offset; } -#define DEFINE_FETCH_file_offset(type) \ -static void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs, \ - void *offset, void *dest)\ -{ \ - void *vaddr = (void *)translate_user_vaddr(offset); \ - \ - FETCH_FUNC_NAME(memory, type)(regs, vaddr, dest); \ -} -DEFINE_BASIC_FETCH_FUNCS(file_offset) -DEFINE_FETCH_file_offset(string) -DEFINE_FETCH_file_offset(string_size) - /* Fetch type information table */ static const struct fetch_type uprobes_fetch_type_table[] = { /* Special types */ @@ -213,6 +185,77 @@ static const struct fetch_type uprobes_fetch_type_table[] = { ASSIGN_FETCH_TYPE_END }; +/* Note that we don't verify it, since the code does not come from user space */ +static int +process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, + bool pre) +{ + unsigned long val; + int ret; + + /* 1st stage: get value from context */ + switch (code->op) { + case FETCH_OP_REG: + val = regs_get_register(regs, code->param); + break; + case FETCH_OP_STACK: + val = get_user_stack_nth(regs, code->param); + break; + case FETCH_OP_STACKP: + val = user_stack_pointer(regs); + break; + case FETCH_OP_RETVAL: + val = regs_return_value(regs); + break; + case FETCH_OP_IMM: + val = code->immediate; + break; + case FETCH_OP_FOFFS: + val = translate_user_vaddr(code->immediate); + break; + default: + return -EILSEQ; + } + code++; + + /* 2nd stage: dereference memory if needed */ + while (code->op == FETCH_OP_DEREF) { + ret = probe_user_read(&val, (void *)val + code->offset, + sizeof(val)); + if (ret) + return ret; + code++; + } + + /* 3rd stage: store value to buffer */ + switch (code->op) { + case FETCH_OP_ST_RAW: + fetch_store_raw(val, code, dest); + break; + case FETCH_OP_ST_MEM: + probe_user_read(dest, (void *)val + code->offset, code->size); + break; + case FETCH_OP_ST_STRING: + if (pre) + fetch_store_strlen(val + code->offset, dest); + else + fetch_store_string(val + code->offset, dest); + break; + default: + return -EILSEQ; + } + code++; + + /* 4th stage: modify stored value if needed */ + if (code->op == FETCH_OP_MOD_BF) { + fetch_apply_bitfield(code, dest); + code++; + } + + return code->op == FETCH_OP_END ? 0 : -EILSEQ; +} +NOKPROBE_SYMBOL(process_fetch_insn) + static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) { rwlock_init(&filter->rwlock); -- cgit v1.2.3 From f451bc89d8357f010304564728ba7c5d38a1d4d5 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 25 Apr 2018 21:18:32 +0900 Subject: tracing: probeevent: Unify fetch type tables Unify {k,u}probe_fetch_type_table to probe_fetch_type_table because the main difference of those type tables (fetcharg methods) are gone. Now we can consolidate it. Link: http://lkml.kernel.org/r/152465871274.26224.13999436317830479698.stgit@devbox Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 27 +---------------------- kernel/trace/trace_probe.c | 54 ++++++++++++++++++++++++++++++--------------- kernel/trace/trace_probe.h | 6 +---- kernel/trace/trace_uprobe.c | 27 +---------------------- 4 files changed, 39 insertions(+), 75 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c024cc40d509..dc1c638daf44 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -121,30 +121,6 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); static int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs); -/* Fetch type information table */ -static const struct fetch_type kprobes_fetch_type_table[] = { - /* Special types */ - [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, - sizeof(u32), 1, "__data_loc char[]"), - [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, - string_size, sizeof(u32), 0, "u32"), - /* Basic types */ - ASSIGN_FETCH_TYPE(u8, u8, 0), - ASSIGN_FETCH_TYPE(u16, u16, 0), - ASSIGN_FETCH_TYPE(u32, u32, 0), - ASSIGN_FETCH_TYPE(u64, u64, 0), - ASSIGN_FETCH_TYPE(s8, u8, 1), - ASSIGN_FETCH_TYPE(s16, u16, 1), - ASSIGN_FETCH_TYPE(s32, u32, 1), - ASSIGN_FETCH_TYPE(s64, u64, 1), - ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0), - ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), - ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), - ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0), - - ASSIGN_FETCH_TYPE_END -}; - /* * Allocate new trace_probe and initialize it (including kprobes). */ @@ -720,8 +696,7 @@ static int create_trace_kprobe(int argc, char **argv) /* Parse fetch argument */ ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, - is_return, true, - kprobes_fetch_type_table); + is_return, true); if (ret) { pr_info("Parse error at argument[%d]. (%d)\n", i, ret); goto error; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index c59c69cb2f2e..d06e67cca3e1 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -61,8 +61,29 @@ int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, void *data, void *ent) const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; -static const struct fetch_type *find_fetch_type(const char *type, - const struct fetch_type *ftbl) +/* Fetch type information table */ +static const struct fetch_type probe_fetch_types[] = { + /* Special types */ + __ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1, + "__data_loc char[]"), + /* Basic types */ + ASSIGN_FETCH_TYPE(u8, u8, 0), + ASSIGN_FETCH_TYPE(u16, u16, 0), + ASSIGN_FETCH_TYPE(u32, u32, 0), + ASSIGN_FETCH_TYPE(u64, u64, 0), + ASSIGN_FETCH_TYPE(s8, u8, 1), + ASSIGN_FETCH_TYPE(s16, u16, 1), + ASSIGN_FETCH_TYPE(s32, u32, 1), + ASSIGN_FETCH_TYPE(s64, u64, 1), + ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0), + ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), + ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), + ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0), + + ASSIGN_FETCH_TYPE_END +}; + +static const struct fetch_type *find_fetch_type(const char *type) { int i; @@ -83,21 +104,21 @@ static const struct fetch_type *find_fetch_type(const char *type, switch (bs) { case 8: - return find_fetch_type("u8", ftbl); + return find_fetch_type("u8"); case 16: - return find_fetch_type("u16", ftbl); + return find_fetch_type("u16"); case 32: - return find_fetch_type("u32", ftbl); + return find_fetch_type("u32"); case 64: - return find_fetch_type("u64", ftbl); + return find_fetch_type("u64"); default: goto fail; } } - for (i = 0; ftbl[i].name; i++) { - if (strcmp(type, ftbl[i].name) == 0) - return &ftbl[i]; + for (i = 0; probe_fetch_types[i].name; i++) { + if (strcmp(type, probe_fetch_types[i].name) == 0) + return &probe_fetch_types[i]; } fail: @@ -164,8 +185,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, static int parse_probe_arg(char *arg, const struct fetch_type *type, struct fetch_insn **pcode, struct fetch_insn *end, - bool is_return, bool is_kprobe, - const struct fetch_type *ftbl) + bool is_return, bool is_kprobe) { struct fetch_insn *code = *pcode; unsigned long param; @@ -247,12 +267,11 @@ parse_probe_arg(char *arg, const struct fetch_type *type, tmp = strrchr(arg, ')'); if (tmp) { - const struct fetch_type *t2; + const struct fetch_type *t2 = find_fetch_type(NULL); - t2 = find_fetch_type(NULL, ftbl); *tmp = '\0'; ret = parse_probe_arg(arg, t2, &code, end, is_return, - is_kprobe, ftbl); + is_kprobe); if (ret) break; if (code->op == FETCH_OP_COMM) @@ -312,8 +331,7 @@ static int __parse_bitfield_probe_arg(const char *bf, /* String length checking wrapper */ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, - struct probe_arg *parg, bool is_return, bool is_kprobe, - const struct fetch_type *ftbl) + struct probe_arg *parg, bool is_return, bool is_kprobe) { struct fetch_insn *code, *tmp = NULL; const char *t; @@ -339,7 +357,7 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, */ if (!t && strcmp(arg, "$comm") == 0) t = "string"; - parg->type = find_fetch_type(t, ftbl); + parg->type = find_fetch_type(t); if (!parg->type) { pr_info("Unsupported type: %s\n", t); return -EINVAL; @@ -353,7 +371,7 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], - is_return, is_kprobe, ftbl); + is_return, is_kprobe); if (ret) goto fail; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 42c724a7ad11..5c262ed6347c 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -184,9 +184,6 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(string); #define ASSIGN_FETCH_TYPE_END {} -#define FETCH_TYPE_STRING 0 -#define FETCH_TYPE_STRSIZE 1 - #ifdef CONFIG_KPROBE_EVENTS bool trace_kprobe_on_func_entry(struct trace_event_call *call); bool trace_kprobe_error_injectable(struct trace_event_call *call); @@ -261,8 +258,7 @@ find_event_file_link(struct trace_probe *tp, struct trace_event_file *file) } extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, - struct probe_arg *parg, bool is_return, bool is_kprobe, - const struct fetch_type *ftbl); + struct probe_arg *parg, bool is_return, bool is_kprobe); extern int traceprobe_conflict_field_name(const char *name, struct probe_arg *args, int narg); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index e076f89ab33a..7772fec84c12 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -161,30 +161,6 @@ static unsigned long translate_user_vaddr(unsigned long file_offset) return base_addr + file_offset; } -/* Fetch type information table */ -static const struct fetch_type uprobes_fetch_type_table[] = { - /* Special types */ - [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, - sizeof(u32), 1, "__data_loc char[]"), - [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, - string_size, sizeof(u32), 0, "u32"), - /* Basic types */ - ASSIGN_FETCH_TYPE(u8, u8, 0), - ASSIGN_FETCH_TYPE(u16, u16, 0), - ASSIGN_FETCH_TYPE(u32, u32, 0), - ASSIGN_FETCH_TYPE(u64, u64, 0), - ASSIGN_FETCH_TYPE(s8, u8, 1), - ASSIGN_FETCH_TYPE(s16, u16, 1), - ASSIGN_FETCH_TYPE(s32, u32, 1), - ASSIGN_FETCH_TYPE(s64, u64, 1), - ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0), - ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), - ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), - ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0), - - ASSIGN_FETCH_TYPE_END -}; - /* Note that we don't verify it, since the code does not come from user space */ static int process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, @@ -621,8 +597,7 @@ static int create_trace_uprobe(int argc, char **argv) /* Parse fetch argument */ ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, - is_return, false, - uprobes_fetch_type_table); + is_return, false); if (ret) { pr_info("Parse error at argument[%d]. (%d)\n", i, ret); goto error; -- cgit v1.2.3 From 9178412ddf5a98feba0ad3986111c5ad10eb9e59 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 25 Apr 2018 21:19:01 +0900 Subject: tracing: probeevent: Return consumed bytes of dynamic area Cleanup string fetching routine so that returns the consumed bytes of dynamic area and store the string information as data_loc format instead of data_rloc. This simplifies the fetcharg loop. Link: http://lkml.kernel.org/r/152465874163.26224.12125143907501289031.stgit@devbox Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 57 ++++++++++++++++++-------------------- kernel/trace/trace_probe.h | 26 ++++-------------- kernel/trace/trace_probe_tmpl.h | 54 +++++++++++++++++------------------- kernel/trace/trace_uprobe.c | 61 ++++++++++++++++++++--------------------- 4 files changed, 88 insertions(+), 110 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index dc1c638daf44..7e5064f8ab8f 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -853,8 +853,8 @@ static const struct file_operations kprobe_profile_ops = { /* Kprobe specific fetch functions */ /* Return the length of string -- including null terminal byte */ -static nokprobe_inline void -fetch_store_strlen(unsigned long addr, void *dest) +static nokprobe_inline int +fetch_store_strlen(unsigned long addr) { mm_segment_t old_fs; int ret, len = 0; @@ -872,47 +872,40 @@ fetch_store_strlen(unsigned long addr, void *dest) pagefault_enable(); set_fs(old_fs); - if (ret < 0) /* Failed to check the length */ - *(u32 *)dest = 0; - else - *(u32 *)dest = len; + return (ret < 0) ? ret : len; } /* * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max * length and relative data location. */ -static nokprobe_inline void -fetch_store_string(unsigned long addr, void *dest) +static nokprobe_inline int +fetch_store_string(unsigned long addr, void *dest, void *base) { - int maxlen = get_rloc_len(*(u32 *)dest); - u8 *dst = get_rloc_data(dest); + int maxlen = get_loc_len(*(u32 *)dest); + u8 *dst = get_loc_data(dest, base); long ret; - if (!maxlen) - return; - + if (unlikely(!maxlen)) + return -ENOMEM; /* * Try to get string again, since the string can be changed while * probing. */ ret = strncpy_from_unsafe(dst, (void *)addr, maxlen); - if (ret < 0) { /* Failed to fetch string */ - dst[0] = '\0'; - *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); - } else { - *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest)); - } + if (ret >= 0) + *(u32 *)dest = make_data_loc(ret, (void *)dst - base); + return ret; } /* Note that we don't verify it, since the code does not come from user space */ static int process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, - bool pre) + void *base) { unsigned long val; - int ret; + int ret = 0; /* 1st stage: get value from context */ switch (code->op) { @@ -949,6 +942,13 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, } /* 3rd stage: store value to buffer */ + if (unlikely(!dest)) { + if (code->op == FETCH_OP_ST_STRING) + return fetch_store_strlen(val + code->offset); + else + return -EILSEQ; + } + switch (code->op) { case FETCH_OP_ST_RAW: fetch_store_raw(val, code, dest); @@ -957,10 +957,7 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, probe_kernel_read(dest, (void *)val + code->offset, code->size); break; case FETCH_OP_ST_STRING: - if (pre) - fetch_store_strlen(val + code->offset, dest); - else - fetch_store_string(val + code->offset, dest); + ret = fetch_store_string(val + code->offset, dest, base); break; default: return -EILSEQ; @@ -973,7 +970,7 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, code++; } - return code->op == FETCH_OP_END ? 0 : -EILSEQ; + return code->op == FETCH_OP_END ? ret : -EILSEQ; } NOKPROBE_SYMBOL(process_fetch_insn) @@ -1008,7 +1005,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, entry = ring_buffer_event_data(event); entry->ip = (unsigned long)tk->rp.kp.addr; - store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); + store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); event_trigger_unlock_commit_regs(trace_file, buffer, event, entry, irq_flags, pc, regs); @@ -1057,7 +1054,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, entry = ring_buffer_event_data(event); entry->func = (unsigned long)tk->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; - store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); + store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); event_trigger_unlock_commit_regs(trace_file, buffer, event, entry, irq_flags, pc, regs); @@ -1203,7 +1200,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); - store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); + store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); return 0; @@ -1239,7 +1236,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, entry->func = (unsigned long)tk->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; - store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); + store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); } diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 5c262ed6347c..b6bdd82fa485 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -54,29 +54,15 @@ #define TP_FLAG_PROFILE 2 #define TP_FLAG_REGISTERED 4 +/* data_loc: data location, compatible with u32 */ +#define make_data_loc(len, offs) \ + (((u32)(len) << 16) | ((u32)(offs) & 0xffff)) +#define get_loc_len(dl) ((u32)(dl) >> 16) +#define get_loc_offs(dl) ((u32)(dl) & 0xffff) -/* data_rloc: data relative location, compatible with u32 */ -#define make_data_rloc(len, roffs) \ - (((u32)(len) << 16) | ((u32)(roffs) & 0xffff)) -#define get_rloc_len(dl) ((u32)(dl) >> 16) -#define get_rloc_offs(dl) ((u32)(dl) & 0xffff) - -/* - * Convert data_rloc to data_loc: - * data_rloc stores the offset from data_rloc itself, but data_loc - * stores the offset from event entry. - */ -#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) - -static nokprobe_inline void *get_rloc_data(u32 *dl) -{ - return (u8 *)dl + get_rloc_offs(*dl); -} - -/* For data_loc conversion */ static nokprobe_inline void *get_loc_data(u32 *dl, void *ent) { - return (u8 *)ent + get_rloc_offs(*dl); + return (u8 *)ent + get_loc_offs(*dl); } /* Printing function type */ diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index c8a5272abf01..3b4aba6f84cc 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -48,24 +48,28 @@ fetch_apply_bitfield(struct fetch_insn *code, void *buf) } } -/* Define this for each callsite */ +/* + * This must be defined for each callsite. + * Return consumed dynamic data size (>= 0), or error (< 0). + * If dest is NULL, don't store result and return required dynamic data size. + */ static int process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, - void *dest, bool pre); + void *dest, void *base); /* Sum up total data length for dynamic arraies (strings) */ static nokprobe_inline int __get_data_size(struct trace_probe *tp, struct pt_regs *regs) { struct probe_arg *arg; - int i, ret = 0; - u32 len; + int i, len, ret = 0; for (i = 0; i < tp->nr_args; i++) { arg = tp->args + i; if (unlikely(arg->dynamic)) { - process_fetch_insn(arg->code, regs, &len, true); - ret += len; + len = process_fetch_insn(arg->code, regs, NULL, NULL); + if (len > 0) + ret += len; } } @@ -74,34 +78,26 @@ __get_data_size(struct trace_probe *tp, struct pt_regs *regs) /* Store the value of each argument */ static nokprobe_inline void -store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs, - u8 *data, int maxlen) +store_trace_args(void *data, struct trace_probe *tp, struct pt_regs *regs, + int header_size, int maxlen) { struct probe_arg *arg; - u32 end = tp->size; - u32 *dl; /* Data (relative) location */ - int i; + void *base = data - header_size; + void *dyndata = data + tp->size; + u32 *dl; /* Data location */ + int ret, i; for (i = 0; i < tp->nr_args; i++) { arg = tp->args + i; - if (unlikely(arg->dynamic)) { - /* - * First, we set the relative location and - * maximum data length to *dl - */ - dl = (u32 *)(data + arg->offset); - *dl = make_data_rloc(maxlen, end - arg->offset); - /* Then try to fetch string or dynamic array data */ - process_fetch_insn(arg->code, regs, dl, false); - /* Reduce maximum length */ - end += get_rloc_len(*dl); - maxlen -= get_rloc_len(*dl); - /* Trick here, convert data_rloc to data_loc */ - *dl = convert_rloc_to_loc(*dl, ent_size + arg->offset); - } else - /* Just fetching data normally */ - process_fetch_insn(arg->code, regs, data + arg->offset, - false); + dl = data + arg->offset; + /* Point the dynamic data area if needed */ + if (unlikely(arg->dynamic)) + *dl = make_data_loc(maxlen, dyndata - base); + ret = process_fetch_insn(arg->code, regs, dl, base); + if (unlikely(ret < 0 && arg->dynamic)) + *dl = make_data_loc(0, dyndata - base); + else + dyndata += ret; } } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 7772fec84c12..08ad51c8ebc0 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -111,43 +111,38 @@ probe_user_read(void *dest, void *src, size_t size) * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max * length and relative data location. */ -static nokprobe_inline void -fetch_store_string(unsigned long addr, void *dest) +static nokprobe_inline int +fetch_store_string(unsigned long addr, void *dest, void *base) { long ret; - u32 rloc = *(u32 *)dest; - int maxlen = get_rloc_len(rloc); - u8 *dst = get_rloc_data(dest); + u32 loc = *(u32 *)dest; + int maxlen = get_loc_len(loc); + u8 *dst = get_loc_data(dest, base); void __user *src = (void __force __user *) addr; - if (!maxlen) - return; + if (unlikely(!maxlen)) + return -ENOMEM; ret = strncpy_from_user(dst, src, maxlen); - if (ret == maxlen) - dst[--ret] = '\0'; - - if (ret < 0) { /* Failed to fetch string */ - ((u8 *)get_rloc_data(dest))[0] = '\0'; - *(u32 *)dest = make_data_rloc(0, get_rloc_offs(rloc)); - } else { - *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(rloc)); + if (ret >= 0) { + if (ret == maxlen) + dst[ret - 1] = '\0'; + *(u32 *)dest = make_data_loc(ret, (void *)dst - base); } + + return ret; } /* Return the length of string -- including null terminal byte */ -static nokprobe_inline void -fetch_store_strlen(unsigned long addr, void *dest) +static nokprobe_inline int +fetch_store_strlen(unsigned long addr) { int len; void __user *vaddr = (void __force __user *) addr; len = strnlen_user(vaddr, MAX_STRING_SIZE); - if (len == 0 || len > MAX_STRING_SIZE) /* Failed to check length */ - *(u32 *)dest = 0; - else - *(u32 *)dest = len; + return (len > MAX_STRING_SIZE) ? 0 : len; } static unsigned long translate_user_vaddr(unsigned long file_offset) @@ -164,10 +159,10 @@ static unsigned long translate_user_vaddr(unsigned long file_offset) /* Note that we don't verify it, since the code does not come from user space */ static int process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, - bool pre) + void *base) { unsigned long val; - int ret; + int ret = 0; /* 1st stage: get value from context */ switch (code->op) { @@ -204,18 +199,22 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, } /* 3rd stage: store value to buffer */ + if (unlikely(!dest)) { + if (code->op == FETCH_OP_ST_STRING) + return fetch_store_strlen(val + code->offset); + else + return -EILSEQ; + } + switch (code->op) { case FETCH_OP_ST_RAW: fetch_store_raw(val, code, dest); break; case FETCH_OP_ST_MEM: - probe_user_read(dest, (void *)val + code->offset, code->size); + probe_kernel_read(dest, (void *)val + code->offset, code->size); break; case FETCH_OP_ST_STRING: - if (pre) - fetch_store_strlen(val + code->offset, dest); - else - fetch_store_string(val + code->offset, dest); + ret = fetch_store_string(val + code->offset, dest, base); break; default: return -EILSEQ; @@ -228,7 +227,7 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, code++; } - return code->op == FETCH_OP_END ? 0 : -EILSEQ; + return code->op == FETCH_OP_END ? ret : -EILSEQ; } NOKPROBE_SYMBOL(process_fetch_insn) @@ -1300,7 +1299,7 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); ucb = uprobe_buffer_get(); - store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); + store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize); if (tu->tp.flags & TP_FLAG_TRACE) ret |= uprobe_trace_func(tu, regs, ucb, dsize); @@ -1335,7 +1334,7 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); ucb = uprobe_buffer_get(); - store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); + store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize); if (tu->tp.flags & TP_FLAG_TRACE) uretprobe_trace_func(tu, func, regs, ucb, dsize); -- cgit v1.2.3 From 0a46c8549f8c775ed6afac57a8b9fd7c4b4d156f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 25 Apr 2018 21:19:30 +0900 Subject: tracing: probeevent: Append traceprobe_ for exported function Append traceprobe_ for exported function set_print_fmt() as same as other functions. Link: http://lkml.kernel.org/r/152465877071.26224.11143125027282999726.stgit@devbox Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 4 ++-- kernel/trace/trace_probe.c | 2 +- kernel/trace/trace_probe.h | 2 +- kernel/trace/trace_uprobe.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7e5064f8ab8f..4895ca85ec79 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1371,7 +1371,7 @@ static int register_kprobe_event(struct trace_kprobe *tk) init_trace_event_call(tk, call); - if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) + if (traceprobe_set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) return -ENOMEM; ret = register_trace_event(&call->event); if (!ret) { @@ -1428,7 +1428,7 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, init_trace_event_call(tk, &tk->tp.call); - if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) { + if (traceprobe_set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) { ret = -ENOMEM; goto error; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index d06e67cca3e1..d119bf8c3b4f 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -490,7 +490,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, return pos; } -int set_print_fmt(struct trace_probe *tp, bool is_return) +int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return) { int len; char *print_fmt; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index b6bdd82fa485..c4e9d3d3216d 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -254,7 +254,7 @@ extern void traceprobe_free_probe_arg(struct probe_arg *arg); extern int traceprobe_split_symbol_offset(char *symbol, long *offset); -extern int set_print_fmt(struct trace_probe *tp, bool is_return); +extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return); #ifdef CONFIG_PERF_EVENTS extern struct trace_event_call * diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 08ad51c8ebc0..912cb2093944 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1370,7 +1370,7 @@ static int register_uprobe_event(struct trace_uprobe *tu) init_trace_event_call(tu, call); - if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) + if (traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) return -ENOMEM; ret = register_trace_event(&call->event); @@ -1443,7 +1443,7 @@ create_local_trace_uprobe(char *name, unsigned long offs, tu->filename = kstrdup(name, GFP_KERNEL); init_trace_event_call(tu, &tu->tp.call); - if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) { + if (traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) { ret = -ENOMEM; goto error; } -- cgit v1.2.3 From 9b960a38835fcaf977f20dcc34ce9e54ff9563bd Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 25 Apr 2018 21:19:59 +0900 Subject: tracing: probeevent: Unify fetch_insn processing common part Unify the fetch_insn bottom process (from stage 2: dereference indirect data) from kprobe and uprobe events, since those are mostly same. Link: http://lkml.kernel.org/r/152465879965.26224.8547240824606804815.stgit@devbox Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 47 ++++++----------------------------- kernel/trace/trace_probe_tmpl.h | 55 ++++++++++++++++++++++++++++++++++++++++- kernel/trace/trace_uprobe.c | 43 ++------------------------------ 3 files changed, 63 insertions(+), 82 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 4895ca85ec79..fdd43f2f1fd1 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -899,13 +899,18 @@ fetch_store_string(unsigned long addr, void *dest, void *base) return ret; } +static nokprobe_inline int +probe_mem_read(void *dest, void *src, size_t size) +{ + return probe_kernel_read(dest, src, size); +} + /* Note that we don't verify it, since the code does not come from user space */ static int process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, void *base) { unsigned long val; - int ret = 0; /* 1st stage: get value from context */ switch (code->op) { @@ -932,45 +937,7 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, } code++; - /* 2nd stage: dereference memory if needed */ - while (code->op == FETCH_OP_DEREF) { - ret = probe_kernel_read(&val, (void *)val + code->offset, - sizeof(val)); - if (ret) - return ret; - code++; - } - - /* 3rd stage: store value to buffer */ - if (unlikely(!dest)) { - if (code->op == FETCH_OP_ST_STRING) - return fetch_store_strlen(val + code->offset); - else - return -EILSEQ; - } - - switch (code->op) { - case FETCH_OP_ST_RAW: - fetch_store_raw(val, code, dest); - break; - case FETCH_OP_ST_MEM: - probe_kernel_read(dest, (void *)val + code->offset, code->size); - break; - case FETCH_OP_ST_STRING: - ret = fetch_store_string(val + code->offset, dest, base); - break; - default: - return -EILSEQ; - } - code++; - - /* 4th stage: modify stored value if needed */ - if (code->op == FETCH_OP_MOD_BF) { - fetch_apply_bitfield(code, dest); - code++; - } - - return code->op == FETCH_OP_END ? ret : -EILSEQ; + return process_fetch_insn_bottom(code, val, dest, base); } NOKPROBE_SYMBOL(process_fetch_insn) diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 3b4aba6f84cc..b4075f3e3a29 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -49,13 +49,66 @@ fetch_apply_bitfield(struct fetch_insn *code, void *buf) } /* - * This must be defined for each callsite. + * These functions must be defined for each callsite. * Return consumed dynamic data size (>= 0), or error (< 0). * If dest is NULL, don't store result and return required dynamic data size. */ static int process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, void *base); +static nokprobe_inline int fetch_store_strlen(unsigned long addr); +static nokprobe_inline int +fetch_store_string(unsigned long addr, void *dest, void *base); +static nokprobe_inline int +probe_mem_read(void *dest, void *src, size_t size); + +/* From the 2nd stage, routine is same */ +static nokprobe_inline int +process_fetch_insn_bottom(struct fetch_insn *code, unsigned long val, + void *dest, void *base) +{ + int ret = 0; + + /* 2nd stage: dereference memory if needed */ + while (code->op == FETCH_OP_DEREF) { + ret = probe_mem_read(&val, (void *)val + code->offset, + sizeof(val)); + if (ret) + return ret; + code++; + } + + /* 3rd stage: store value to buffer */ + if (unlikely(!dest)) { + if (code->op == FETCH_OP_ST_STRING) + return fetch_store_strlen(val + code->offset); + else + return -EILSEQ; + } + + switch (code->op) { + case FETCH_OP_ST_RAW: + fetch_store_raw(val, code, dest); + break; + case FETCH_OP_ST_MEM: + probe_mem_read(dest, (void *)val + code->offset, code->size); + break; + case FETCH_OP_ST_STRING: + ret = fetch_store_string(val + code->offset, dest, base); + break; + default: + return -EILSEQ; + } + code++; + + /* 4th stage: modify stored value if needed */ + if (code->op == FETCH_OP_MOD_BF) { + fetch_apply_bitfield(code, dest); + code++; + } + + return code->op == FETCH_OP_END ? ret : -EILSEQ; +} /* Sum up total data length for dynamic arraies (strings) */ static nokprobe_inline int diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 912cb2093944..7154473ffaa4 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -101,7 +101,7 @@ static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n) * Uprobes-specific fetch functions */ static nokprobe_inline int -probe_user_read(void *dest, void *src, size_t size) +probe_mem_read(void *dest, void *src, size_t size) { void __user *vaddr = (void __force __user *)src; @@ -162,7 +162,6 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, void *base) { unsigned long val; - int ret = 0; /* 1st stage: get value from context */ switch (code->op) { @@ -189,45 +188,7 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, } code++; - /* 2nd stage: dereference memory if needed */ - while (code->op == FETCH_OP_DEREF) { - ret = probe_user_read(&val, (void *)val + code->offset, - sizeof(val)); - if (ret) - return ret; - code++; - } - - /* 3rd stage: store value to buffer */ - if (unlikely(!dest)) { - if (code->op == FETCH_OP_ST_STRING) - return fetch_store_strlen(val + code->offset); - else - return -EILSEQ; - } - - switch (code->op) { - case FETCH_OP_ST_RAW: - fetch_store_raw(val, code, dest); - break; - case FETCH_OP_ST_MEM: - probe_kernel_read(dest, (void *)val + code->offset, code->size); - break; - case FETCH_OP_ST_STRING: - ret = fetch_store_string(val + code->offset, dest, base); - break; - default: - return -EILSEQ; - } - code++; - - /* 4th stage: modify stored value if needed */ - if (code->op == FETCH_OP_MOD_BF) { - fetch_apply_bitfield(code, dest); - code++; - } - - return code->op == FETCH_OP_END ? ret : -EILSEQ; + return process_fetch_insn_bottom(code, val, dest, base); } NOKPROBE_SYMBOL(process_fetch_insn) -- cgit v1.2.3 From 60c2e0cebfd01bd1bc5e8843f063264148d6b2bb Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 25 Apr 2018 21:20:28 +0900 Subject: tracing: probeevent: Add symbol type Add "symbol" type to probeevent, which is an alias of u32 or u64 (depends on BITS_PER_LONG). This shows the result value in symbol+offset style. This type is only available with kprobe events. Link: http://lkml.kernel.org/r/152465882860.26224.14779072294412467338.stgit@devbox Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/kprobetrace.rst | 2 ++ kernel/trace/trace.c | 2 +- kernel/trace/trace_probe.c | 8 ++++++++ kernel/trace/trace_probe.h | 12 +++++++++--- 4 files changed, 20 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst index 8bfc75c90806..6224ddf34508 100644 --- a/Documentation/trace/kprobetrace.rst +++ b/Documentation/trace/kprobetrace.rst @@ -72,6 +72,8 @@ offset, and container-size (usually 32). The syntax is:: b@/ +Symbol type('symbol') is an alias of u32 or u64 type (depends on BITS_PER_LONG) +which shows given pointer in "symbol+offset" style. For $comm, the default type is "string"; any other type is invalid. diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 147be8523560..1e3f28b1fa07 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4626,7 +4626,7 @@ static const char readme_msg[] = "\t args: =fetcharg[:type]\n" "\t fetcharg: %, @
, @[+|-],\n" "\t $stack, $stack, $retval, $comm\n" - "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string,\n" + "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n" "\t b@/\n" #endif " events/\t\t- Directory containing all trace event subsystems:\n" diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index d119bf8c3b4f..1e7e0618577d 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -46,6 +46,13 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(x16, u16, "0x%x") DEFINE_BASIC_PRINT_TYPE_FUNC(x32, u32, "0x%x") DEFINE_BASIC_PRINT_TYPE_FUNC(x64, u64, "0x%Lx") +int PRINT_TYPE_FUNC_NAME(symbol)(struct trace_seq *s, void *data, void *ent) +{ + trace_seq_printf(s, "%pS", (void *)*(unsigned long *)data); + return !trace_seq_has_overflowed(s); +} +const char PRINT_TYPE_FMT_NAME(symbol)[] = "%pS"; + /* Print type function for string type */ int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, void *data, void *ent) { @@ -79,6 +86,7 @@ static const struct fetch_type probe_fetch_types[] = { ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0), + ASSIGN_FETCH_TYPE_ALIAS(symbol, ADDR_FETCH_TYPE, ADDR_FETCH_TYPE, 0), ASSIGN_FETCH_TYPE_END }; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index c4e9d3d3216d..469110e0790b 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -145,6 +145,7 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(x32); DECLARE_BASIC_PRINT_TYPE_FUNC(x64); DECLARE_BASIC_PRINT_TYPE_FUNC(string); +DECLARE_BASIC_PRINT_TYPE_FUNC(symbol); /* Default (unsigned long) fetch type */ #define __DEFAULT_FETCH_TYPE(t) x##t @@ -152,6 +153,10 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(string); #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) +#define __ADDR_FETCH_TYPE(t) u##t +#define _ADDR_FETCH_TYPE(t) __ADDR_FETCH_TYPE(t) +#define ADDR_FETCH_TYPE _ADDR_FETCH_TYPE(BITS_PER_LONG) + #define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ {.name = _name, \ .size = _size, \ @@ -160,13 +165,14 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(string); .fmt = PRINT_TYPE_FMT_NAME(ptype), \ .fmttype = _fmttype, \ } - +#define _ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ + __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, #_fmttype) #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ - __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) + _ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, ptype) /* If ptype is an alias of atype, use this macro (show atype in format) */ #define ASSIGN_FETCH_TYPE_ALIAS(ptype, atype, ftype, sign) \ - __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #atype) + _ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, atype) #define ASSIGN_FETCH_TYPE_END {} -- cgit v1.2.3 From 40b53b771806b1770837169cd32d1bf167fbccaf Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 25 Apr 2018 21:21:55 +0900 Subject: tracing: probeevent: Add array type support Add array type support for probe events. This allows user to get arraied types from memory address. The array type syntax is TYPE[N] Where TYPE is one of types (u8/16/32/64,s8/16/32/64, x8/16/32/64, symbol, string) and N is a fixed value less than 64. The string array type is a bit different from other types. For other base types, [1] is equal to (e.g. +0(%di):x32[1] is same as +0(%di):x32.) But string[1] is not equal to string. The string type itself represents "char array", but string array type represents "char * array". So, for example, +0(%di):string[1] is equal to +0(+0(%di)):string. Link: http://lkml.kernel.org/r/152465891533.26224.6150658225601339931.stgit@devbox Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/kprobetrace.rst | 11 +++ kernel/trace/trace.c | 3 +- kernel/trace/trace_probe.c | 130 +++++++++++++++++++++++++++--------- kernel/trace/trace_probe.h | 14 ++++ kernel/trace/trace_probe_tmpl.h | 63 ++++++++++++++--- 5 files changed, 181 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst index 6224ddf34508..2dfed7a1ea6f 100644 --- a/Documentation/trace/kprobetrace.rst +++ b/Documentation/trace/kprobetrace.rst @@ -64,9 +64,20 @@ respectively. 'x' prefix implies it is unsigned. Traced arguments are shown in decimal ('s' and 'u') or hexadecimal ('x'). Without type casting, 'x32' or 'x64' is used depends on the architecture (e.g. x86-32 uses x32, and x86-64 uses x64). +These value types can be an array. To record array data, you can add '[N]' +(where N is a fixed number, less than 64) to the base type. +E.g. 'x16[4]' means an array of x16 (2bytes hex) with 4 elements. +Note that the array can be applied to memory type fetchargs, you can not +apply it to registers/stack-entries etc. (for example, '$stack1:x8[8]' is +wrong, but '+8($stack):x8[8]' is OK.) String type is a special type, which fetches a "null-terminated" string from kernel space. This means it will fail and store NULL if the string container has been paged out. +The string array type is a bit different from other types. For other base +types, [1] is equal to (e.g. +0(%di):x32[1] is same +as +0(%di):x32.) But string[1] is not equal to string. The string type itself +represents "char array", but string array type represents "char * array". +So, for example, +0(%di):string[1] is equal to +0(+0(%di)):string. Bitfield is another special type, which takes 3 parameters, bit-width, bit- offset, and container-size (usually 32). The syntax is:: diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1e3f28b1fa07..e7f99f513959 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4627,7 +4627,8 @@ static const char readme_msg[] = "\t fetcharg: %, @
, @[+|-],\n" "\t $stack, $stack, $retval, $comm\n" "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n" - "\t b@/\n" + "\t b@/,\n" + "\t \\[\\]\n" #endif " events/\t\t- Directory containing all trace event subsystems:\n" " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 1e7e0618577d..dfd096031305 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -341,9 +341,9 @@ static int __parse_bitfield_probe_arg(const char *bf, int traceprobe_parse_probe_arg(char *arg, ssize_t *size, struct probe_arg *parg, bool is_return, bool is_kprobe) { - struct fetch_insn *code, *tmp = NULL; - const char *t; - int ret; + struct fetch_insn *code, *scode, *tmp = NULL; + char *t, *t2; + int ret, len; if (strlen(arg) > MAX_ARGSTR_LEN) { pr_info("Argument is too long.: %s\n", arg); @@ -354,24 +354,42 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, pr_info("Failed to allocate memory for command '%s'.\n", arg); return -ENOMEM; } - t = strchr(parg->comm, ':'); + t = strchr(arg, ':'); if (t) { - arg[t - parg->comm] = '\0'; - t++; + *t = '\0'; + t2 = strchr(++t, '['); + if (t2) { + *t2 = '\0'; + parg->count = simple_strtoul(t2 + 1, &t2, 0); + if (strcmp(t2, "]") || parg->count == 0) + return -EINVAL; + if (parg->count > MAX_ARRAY_LEN) + return -E2BIG; + } } /* * The default type of $comm should be "string", and it can't be * dereferenced. */ if (!t && strcmp(arg, "$comm") == 0) - t = "string"; - parg->type = find_fetch_type(t); + parg->type = find_fetch_type("string"); + else + parg->type = find_fetch_type(t); if (!parg->type) { pr_info("Unsupported type: %s\n", t); return -EINVAL; } parg->offset = *size; - *size += parg->type->size; + *size += parg->type->size * (parg->count ?: 1); + + if (parg->count) { + len = strlen(parg->type->fmttype) + 6; + parg->fmt = kmalloc(len, GFP_KERNEL); + if (!parg->fmt) + return -ENOMEM; + snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype, + parg->count); + } code = tmp = kzalloc(sizeof(*code) * FETCH_INSN_MAX, GFP_KERNEL); if (!code) @@ -391,10 +409,20 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, ret = -EINVAL; goto fail; } - /* Since IMM or COMM must be the 1st insn, this is safe */ - if (code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM) + if (code->op != FETCH_OP_DEREF || parg->count) { + /* + * IMM and COMM is pointing actual address, those must + * be kept, and if parg->count != 0, this is an array + * of string pointers instead of string address itself. + */ code++; + if (code->op != FETCH_OP_NOP) { + ret = -E2BIG; + goto fail; + } + } code->op = FETCH_OP_ST_STRING; /* In DEREF case, replace it */ + code->size = parg->type->size; parg->dynamic = true; } else if (code->op == FETCH_OP_DEREF) { code->op = FETCH_OP_ST_MEM; @@ -408,12 +436,29 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, code->op = FETCH_OP_ST_RAW; code->size = parg->type->size; } + scode = code; /* Modify operation */ if (t != NULL) { ret = __parse_bitfield_probe_arg(t, parg->type, &code); if (ret) goto fail; } + /* Loop(Array) operation */ + if (parg->count) { + if (scode->op != FETCH_OP_ST_MEM && + scode->op != FETCH_OP_ST_STRING) { + pr_info("array only accepts memory or address\n"); + ret = -EINVAL; + goto fail; + } + code++; + if (code->op != FETCH_OP_NOP) { + ret = -E2BIG; + goto fail; + } + code->op = FETCH_OP_LP_ARRAY; + code->param = parg->count; + } code++; code->op = FETCH_OP_END; @@ -452,14 +497,17 @@ void traceprobe_free_probe_arg(struct probe_arg *arg) kfree(arg->code); kfree(arg->name); kfree(arg->comm); + kfree(arg->fmt); } +/* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, bool is_return) { - int i; + struct probe_arg *parg; + int i, j; int pos = 0; - const char *fmt, *arg; if (!is_return) { @@ -470,33 +518,49 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; } - /* When len=0, we just calculate the needed length */ -#define LEN_OR_ZERO (len ? len - pos : 0) - pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); for (i = 0; i < tp->nr_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", - tp->args[i].name, tp->args[i].type->fmt); + parg = tp->args + i; + pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=", parg->name); + if (parg->count) { + pos += snprintf(buf + pos, LEN_OR_ZERO, "{%s", + parg->type->fmt); + for (j = 1; j < parg->count; j++) + pos += snprintf(buf + pos, LEN_OR_ZERO, ",%s", + parg->type->fmt); + pos += snprintf(buf + pos, LEN_OR_ZERO, "}"); + } else + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", + parg->type->fmt); } pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); for (i = 0; i < tp->nr_args; i++) { - if (strcmp(tp->args[i].type->name, "string") == 0) + parg = tp->args + i; + if (parg->count) { + if (strcmp(parg->type->name, "string") == 0) + fmt = ", __get_str(%s[%d])"; + else + fmt = ", REC->%s[%d]"; + for (j = 0; j < parg->count; j++) + pos += snprintf(buf + pos, LEN_OR_ZERO, + fmt, parg->name, j); + } else { + if (strcmp(parg->type->name, "string") == 0) + fmt = ", __get_str(%s)"; + else + fmt = ", REC->%s"; pos += snprintf(buf + pos, LEN_OR_ZERO, - ", __get_str(%s)", - tp->args[i].name); - else - pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", - tp->args[i].name); + fmt, parg->name); + } } -#undef LEN_OR_ZERO - /* return the length of print_fmt */ return pos; } +#undef LEN_OR_ZERO int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return) { @@ -524,11 +588,15 @@ int traceprobe_define_arg_fields(struct trace_event_call *event_call, /* Set argument names as fields */ for (i = 0; i < tp->nr_args; i++) { struct probe_arg *parg = &tp->args[i]; - - ret = trace_define_field(event_call, parg->type->fmttype, - parg->name, - offset + parg->offset, - parg->type->size, + const char *fmt = parg->type->fmttype; + int size = parg->type->size; + + if (parg->fmt) + fmt = parg->fmt; + if (parg->count) + size *= parg->count; + ret = trace_define_field(event_call, fmt, parg->name, + offset + parg->offset, size, parg->type->is_signed, FILTER_OTHER); if (ret) diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 469110e0790b..1f456fd82483 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -30,6 +30,7 @@ #define MAX_TRACE_ARGS 128 #define MAX_ARGSTR_LEN 63 +#define MAX_ARRAY_LEN 64 #define MAX_STRING_SIZE PATH_MAX /* Reserved field names */ @@ -65,6 +66,14 @@ static nokprobe_inline void *get_loc_data(u32 *dl, void *ent) return (u8 *)ent + get_loc_offs(*dl); } +static nokprobe_inline u32 update_data_loc(u32 loc, int consumed) +{ + u32 maxlen = get_loc_len(loc); + u32 offset = get_loc_offs(loc); + + return make_data_loc(maxlen - consumed, offset + consumed); +} + /* Printing function type */ typedef int (*print_type_func_t)(struct trace_seq *, void *, void *); @@ -86,6 +95,8 @@ enum fetch_op { FETCH_OP_ST_STRING, /* String: .offset, .size */ // Stage 4 (modify) op FETCH_OP_MOD_BF, /* Bitfield: .basesize, .lshift, .rshift */ + // Stage 5 (loop) op + FETCH_OP_LP_ARRAY, /* Array: .param = loop count */ FETCH_OP_END, }; @@ -175,6 +186,7 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(symbol); _ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, atype) #define ASSIGN_FETCH_TYPE_END {} +#define MAX_ARRAY_LEN 64 #ifdef CONFIG_KPROBE_EVENTS bool trace_kprobe_on_func_entry(struct trace_event_call *call); @@ -195,8 +207,10 @@ struct probe_arg { struct fetch_insn *code; bool dynamic;/* Dynamic array (string) is used */ unsigned int offset; /* Offset from argument entry */ + unsigned int count; /* Array count */ const char *name; /* Name of this argument */ const char *comm; /* Command of this argument */ + char *fmt; /* Format string if needed */ const struct fetch_type *type; /* Type of this argument */ }; diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index b4075f3e3a29..5c56afc17cf8 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -67,10 +67,15 @@ static nokprobe_inline int process_fetch_insn_bottom(struct fetch_insn *code, unsigned long val, void *dest, void *base) { - int ret = 0; + struct fetch_insn *s3 = NULL; + int total = 0, ret = 0, i = 0; + u32 loc = 0; + unsigned long lval = val; +stage2: /* 2nd stage: dereference memory if needed */ while (code->op == FETCH_OP_DEREF) { + lval = val; ret = probe_mem_read(&val, (void *)val + code->offset, sizeof(val)); if (ret) @@ -78,11 +83,15 @@ process_fetch_insn_bottom(struct fetch_insn *code, unsigned long val, code++; } + s3 = code; +stage3: /* 3rd stage: store value to buffer */ if (unlikely(!dest)) { - if (code->op == FETCH_OP_ST_STRING) - return fetch_store_strlen(val + code->offset); - else + if (code->op == FETCH_OP_ST_STRING) { + ret += fetch_store_strlen(val + code->offset); + code++; + goto array; + } else return -EILSEQ; } @@ -94,6 +103,7 @@ process_fetch_insn_bottom(struct fetch_insn *code, unsigned long val, probe_mem_read(dest, (void *)val + code->offset, code->size); break; case FETCH_OP_ST_STRING: + loc = *(u32 *)dest; ret = fetch_store_string(val + code->offset, dest, base); break; default: @@ -107,6 +117,29 @@ process_fetch_insn_bottom(struct fetch_insn *code, unsigned long val, code++; } +array: + /* the last stage: Loop on array */ + if (code->op == FETCH_OP_LP_ARRAY) { + total += ret; + if (++i < code->param) { + code = s3; + if (s3->op != FETCH_OP_ST_STRING) { + dest += s3->size; + val += s3->size; + goto stage3; + } + code--; + val = lval + sizeof(char *); + if (dest) { + dest += sizeof(u32); + *(u32 *)dest = update_data_loc(loc, ret); + } + goto stage2; + } + code++; + ret = total; + } + return code->op == FETCH_OP_END ? ret : -EILSEQ; } @@ -158,12 +191,26 @@ static inline int print_probe_args(struct trace_seq *s, struct probe_arg *args, int nr_args, u8 *data, void *field) { - int i; + void *p; + int i, j; for (i = 0; i < nr_args; i++) { - trace_seq_printf(s, " %s=", args[i].name); - if (!args[i].type->print(s, data + args[i].offset, field)) - return -ENOMEM; + struct probe_arg *a = args + i; + + trace_seq_printf(s, " %s=", a->name); + if (likely(!a->count)) { + if (!a->type->print(s, data + a->offset, field)) + return -ENOMEM; + continue; + } + trace_seq_putc(s, '{'); + p = data + a->offset; + for (j = 0; j < a->count; j++) { + if (!a->type->print(s, p, field)) + return -ENOMEM; + trace_seq_putc(s, j == a->count - 1 ? '}' : ','); + p += a->type->size; + } } return 0; } -- cgit v1.2.3 From a1303af5d79eb13a658633a9fb0ce3aed0f7decf Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 25 Apr 2018 21:21:26 +0900 Subject: tracing: probeevent: Add $argN for accessing function args Add $argN special fetch variable for accessing function arguments. This allows user to trace the Nth argument easily at the function entry. Note that this returns most probably assignment of registers and stacks. In some case, it may not work well. If you need to access correct registers or stacks you should use perf-probe. Link: http://lkml.kernel.org/r/152465888632.26224.3412465701570253696.stgit@devbox Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/kprobetrace.rst | 10 ++++++---- kernel/trace/trace.c | 4 ++++ kernel/trace/trace_kprobe.c | 18 +++++++++++++----- kernel/trace/trace_probe.c | 36 +++++++++++++++++++++++------------- kernel/trace/trace_probe.h | 9 ++++++++- kernel/trace/trace_uprobe.c | 2 +- 6 files changed, 55 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst index 2dfed7a1ea6f..47e765c2f2c3 100644 --- a/Documentation/trace/kprobetrace.rst +++ b/Documentation/trace/kprobetrace.rst @@ -45,16 +45,18 @@ Synopsis of kprobe_events @SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol) $stackN : Fetch Nth entry of stack (N >= 0) $stack : Fetch stack address. - $retval : Fetch return value.(*) + $argN : Fetch the Nth function argument. (N >= 1) (\*1) + $retval : Fetch return value.(\*2) $comm : Fetch current task comm. - +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**) + +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(\*3) NAME=FETCHARG : Set NAME as the argument name of FETCHARG. FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types (u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types (x8/x16/x32/x64), "string" and bitfield are supported. - (*) only for return probe. - (**) this is useful for fetching a field of data structures. + (\*1) only for the probe on function entry (offs == 0). + (\*2) only for return probe. + (\*3) this is useful for fetching a field of data structures. Types ----- diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e7f99f513959..ec5b21778806 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4625,7 +4625,11 @@ static const char readme_msg[] = #endif "\t args: =fetcharg[:type]\n" "\t fetcharg: %, @
, @[+|-],\n" +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + "\t $stack, $stack, $retval, $comm, $arg\n" +#else "\t $stack, $stack, $retval, $comm\n" +#endif "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n" "\t b@/,\n" "\t \\[\\]\n" diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index fdd43f2f1fd1..3faaadbddf54 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -533,13 +533,15 @@ static int create_trace_kprobe(int argc, char **argv) long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; + unsigned int flags = TPARG_FL_KERNEL; /* argc must be >= 1 */ if (argv[0][0] == 'p') is_return = false; - else if (argv[0][0] == 'r') + else if (argv[0][0] == 'r') { is_return = true; - else if (argv[0][0] == '-') + flags |= TPARG_FL_RETURN; + } else if (argv[0][0] == '-') is_delete = true; else { pr_info("Probe definition must be started with 'p', 'r' or" @@ -625,8 +627,9 @@ static int create_trace_kprobe(int argc, char **argv) pr_info("Failed to parse either an address or a symbol.\n"); return ret; } - if (offset && is_return && - !kprobe_on_func_entry(NULL, symbol, offset)) { + if (kprobe_on_func_entry(NULL, symbol, offset)) + flags |= TPARG_FL_FENTRY; + if (offset && is_return && !(flags & TPARG_FL_FENTRY)) { pr_info("Given offset is not valid for return probe.\n"); return -EINVAL; } @@ -696,7 +699,7 @@ static int create_trace_kprobe(int argc, char **argv) /* Parse fetch argument */ ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, - is_return, true); + flags); if (ret) { pr_info("Parse error at argument[%d]. (%d)\n", i, ret); goto error; @@ -932,6 +935,11 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, case FETCH_OP_COMM: val = (unsigned long)current->comm; break; +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + case FETCH_OP_ARG: + val = regs_get_kernel_argument(regs, code->param); + break; +#endif default: return -EILSEQ; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index dfd096031305..333cda6d2633 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -157,14 +157,13 @@ int traceprobe_split_symbol_offset(char *symbol, long *offset) #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) static int parse_probe_vars(char *arg, const struct fetch_type *t, - struct fetch_insn *code, bool is_return, - bool is_kprobe) + struct fetch_insn *code, unsigned int flags) { int ret = 0; unsigned long param; if (strcmp(arg, "retval") == 0) { - if (is_return) + if (flags & TPARG_FL_RETURN) code->op = FETCH_OP_RETVAL; else ret = -EINVAL; @@ -173,7 +172,8 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, code->op = FETCH_OP_STACKP; } else if (isdigit(arg[5])) { ret = kstrtoul(arg + 5, 10, ¶m); - if (ret || (is_kprobe && param > PARAM_MAX_STACK)) + if (ret || ((flags & TPARG_FL_KERNEL) && + param > PARAM_MAX_STACK)) ret = -EINVAL; else { code->op = FETCH_OP_STACK; @@ -183,6 +183,18 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, ret = -EINVAL; } else if (strcmp(arg, "comm") == 0) { code->op = FETCH_OP_COMM; +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + } else if (((flags & TPARG_FL_MASK) == + (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) && + strncmp(arg, "arg", 3) == 0) { + if (!isdigit(arg[3])) + return -EINVAL; + ret = kstrtoul(arg + 3, 10, ¶m); + if (ret || !param || param > PARAM_MAX_STACK) + return -EINVAL; + code->op = FETCH_OP_ARG; + code->param = (unsigned int)param - 1; +#endif } else ret = -EINVAL; @@ -193,7 +205,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, static int parse_probe_arg(char *arg, const struct fetch_type *type, struct fetch_insn **pcode, struct fetch_insn *end, - bool is_return, bool is_kprobe) + unsigned int flags) { struct fetch_insn *code = *pcode; unsigned long param; @@ -203,8 +215,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, switch (arg[0]) { case '$': - ret = parse_probe_vars(arg + 1, type, code, - is_return, is_kprobe); + ret = parse_probe_vars(arg + 1, type, code, flags); break; case '%': /* named register */ @@ -226,7 +237,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->immediate = param; } else if (arg[1] == '+') { /* kprobes don't support file offsets */ - if (is_kprobe) + if (flags & TPARG_FL_KERNEL) return -EINVAL; ret = kstrtol(arg + 2, 0, &offset); @@ -237,7 +248,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->immediate = (unsigned long)offset; // imm64? } else { /* uprobes don't support symbols */ - if (!is_kprobe) + if (!(flags & TPARG_FL_KERNEL)) return -EINVAL; ret = traceprobe_split_symbol_offset(arg + 1, &offset); @@ -278,8 +289,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, const struct fetch_type *t2 = find_fetch_type(NULL); *tmp = '\0'; - ret = parse_probe_arg(arg, t2, &code, end, is_return, - is_kprobe); + ret = parse_probe_arg(arg, t2, &code, end, flags); if (ret) break; if (code->op == FETCH_OP_COMM) @@ -339,7 +349,7 @@ static int __parse_bitfield_probe_arg(const char *bf, /* String length checking wrapper */ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, - struct probe_arg *parg, bool is_return, bool is_kprobe) + struct probe_arg *parg, unsigned int flags) { struct fetch_insn *code, *scode, *tmp = NULL; char *t, *t2; @@ -397,7 +407,7 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], - is_return, is_kprobe); + flags); if (ret) goto fail; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 1f456fd82483..09f62171cc23 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "trace.h" @@ -86,6 +87,7 @@ enum fetch_op { FETCH_OP_RETVAL, /* Return value */ FETCH_OP_IMM, /* Immediate : .immediate */ FETCH_OP_COMM, /* Current comm */ + FETCH_OP_ARG, /* Function argument : .param */ FETCH_OP_FOFFS, /* File offset: .immediate */ // Stage 2 (dereference) op FETCH_OP_DEREF, /* Dereference: .offset */ @@ -263,8 +265,13 @@ find_event_file_link(struct trace_probe *tp, struct trace_event_file *file) return NULL; } +#define TPARG_FL_RETURN BIT(0) +#define TPARG_FL_KERNEL BIT(1) +#define TPARG_FL_FENTRY BIT(2) +#define TPARG_FL_MASK GENMASK(2, 0) + extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, - struct probe_arg *parg, bool is_return, bool is_kprobe); + struct probe_arg *parg, unsigned int flags); extern int traceprobe_conflict_field_name(const char *name, struct probe_arg *args, int narg); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 7154473ffaa4..394b93572506 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -557,7 +557,7 @@ static int create_trace_uprobe(int argc, char **argv) /* Parse fetch argument */ ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, - is_return, false); + is_return ? TPARG_FL_RETURN : 0); if (ret) { pr_info("Parse error at argument[%d]. (%d)\n", i, ret); goto error; -- cgit v1.2.3 From f3f58935edbcb33fd529fc46d554162a0660fd2d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 29 Aug 2018 01:17:47 +0900 Subject: tracing/uprobes: Fix to return -EFAULT if copy_from_user failed Fix probe_mem_read() to return -EFAULT if copy_from_user() failed. The copy_from_user() returns remaining bytes when it failed, but probe_mem_read() caller expects it returns error code like as probe_kernel_read(). Link: http://lkml.kernel.org/r/153547306719.26502.8353484532699160223.stgit@devbox Reported-by: Dan Carpenter Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 394b93572506..31ea48eceda1 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -105,7 +105,7 @@ probe_mem_read(void *dest, void *src, size_t size) { void __user *vaddr = (void __force __user *)src; - return copy_from_user(dest, vaddr, size); + return copy_from_user(dest, vaddr, size) ? -EFAULT : 0; } /* * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max -- cgit v1.2.3 From 59158ec4aef7d44be51a6f3e7e17fc64c32604eb Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 29 Aug 2018 01:18:15 +0900 Subject: tracing/kprobes: Check the probe on unloaded module correctly Current kprobe event doesn't checks correctly whether the given event is on unloaded module or not. It just checks the event has ":" in the name. That is not enough because if we define a probe on non-exist symbol on loaded module, it allows to define that (with warning message) To ensure it correctly, this searches the module name on loaded module list and only if there is not, it allows to define it. (this event will be available when the target module is loaded) Link: http://lkml.kernel.org/r/153547309528.26502.8300278470528281328.stgit@devbox Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 3faaadbddf54..4727a13824f0 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -62,9 +62,23 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, return strncmp(mod->name, name, len) == 0 && name[len] == ':'; } -static nokprobe_inline bool trace_kprobe_is_on_module(struct trace_kprobe *tk) +static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) { - return !!strchr(trace_kprobe_symbol(tk), ':'); + char *p; + bool ret; + + if (!tk->symbol) + return false; + p = strchr(tk->symbol, ':'); + if (!p) + return true; + *p = '\0'; + mutex_lock(&module_mutex); + ret = !!find_module(tk->symbol); + mutex_unlock(&module_mutex); + *p = ':'; + + return ret; } static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) @@ -374,19 +388,13 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) else ret = register_kprobe(&tk->rp.kp); - if (ret == 0) + if (ret == 0) { tk->tp.flags |= TP_FLAG_REGISTERED; - else { - if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) { - pr_warn("This probe might be able to register after target module is loaded. Continue.\n"); - ret = 0; - } else if (ret == -EILSEQ) { - pr_warn("Probing address(0x%p) is not an instruction boundary.\n", - tk->rp.kp.addr); - ret = -EINVAL; - } + } else if (ret == -EILSEQ) { + pr_warn("Probing address(0x%p) is not an instruction boundary.\n", + tk->rp.kp.addr); + ret = -EINVAL; } - return ret; } @@ -449,6 +457,11 @@ static int register_trace_kprobe(struct trace_kprobe *tk) /* Register k*probe */ ret = __register_trace_kprobe(tk); + if (ret == -ENOENT && !trace_kprobe_module_exist(tk)) { + pr_warn("This probe might be able to register after target module is loaded. Continue.\n"); + ret = 0; + } + if (ret < 0) unregister_kprobe_event(tk); else -- cgit v1.2.3 From a6682814f37124ec1e708cca8f44968445fa9dd7 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 29 Aug 2018 01:18:43 +0900 Subject: tracing/kprobes: Allow kprobe-events to record module symbol Allow kprobe-events to record module symbols. Since data symbols in a non-loaded module doesn't exist, it fails to define such symbol as an argument of kprobe-event. But if the kprobe event is defined on that module, we can defer to resolve the symbol address. Note that if given symbol is not found, the event is kept unavailable. User can enable it but the event is not recorded. Link: http://lkml.kernel.org/r/153547312336.26502.11432902826345374463.stgit@devbox Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 12 ++++++++- kernel/trace/trace_probe.c | 62 +++++++++++++++++++++++++++++++++++++++------ kernel/trace/trace_probe.h | 4 ++- 3 files changed, 68 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 4727a13824f0..fec67188c4d2 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -366,7 +366,7 @@ static bool within_notrace_func(struct trace_kprobe *tk) /* Internal register function - just handle k*probes and flags */ static int __register_trace_kprobe(struct trace_kprobe *tk) { - int ret; + int i, ret; if (trace_probe_is_registered(&tk->tp)) return -EINVAL; @@ -377,6 +377,12 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) return -EINVAL; } + for (i = 0; i < tk->tp.nr_args; i++) { + ret = traceprobe_update_arg(&tk->tp.args[i]); + if (ret) + return ret; + } + /* Set/clear disabled flag according to tp->flag */ if (trace_probe_is_enabled(&tk->tp)) tk->rp.kp.flags &= ~KPROBE_FLAG_DISABLED; @@ -928,6 +934,7 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, { unsigned long val; +retry: /* 1st stage: get value from context */ switch (code->op) { case FETCH_OP_REG: @@ -953,6 +960,9 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, val = regs_get_kernel_argument(regs, code->param); break; #endif + case FETCH_NOP_SYMBOL: /* Ignore a place holder */ + code++; + goto retry; default: return -EILSEQ; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 333cda6d2633..5b3d573b3dcf 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -251,16 +251,16 @@ parse_probe_arg(char *arg, const struct fetch_type *type, if (!(flags & TPARG_FL_KERNEL)) return -EINVAL; - ret = traceprobe_split_symbol_offset(arg + 1, &offset); - if (ret) - break; + /* Preserve symbol for updating */ + code->op = FETCH_NOP_SYMBOL; + code->data = kstrdup(arg + 1, GFP_KERNEL); + if (!code->data) + return -ENOMEM; + if (++code == end) + return -E2BIG; code->op = FETCH_OP_IMM; - code->immediate = - (unsigned long)kallsyms_lookup_name(arg + 1); - if (!code->immediate) - return -ENOENT; - code->immediate += offset; + code->immediate = 0; } /* These are fetching from memory */ if (++code == end) @@ -480,6 +480,11 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, memcpy(parg->code, tmp, sizeof(*code) * (code - tmp + 1)); fail: + if (ret) { + for (code = tmp; code < tmp + FETCH_INSN_MAX; code++) + if (code->op == FETCH_NOP_SYMBOL) + kfree(code->data); + } kfree(tmp); return ret; @@ -504,12 +509,53 @@ int traceprobe_conflict_field_name(const char *name, void traceprobe_free_probe_arg(struct probe_arg *arg) { + struct fetch_insn *code = arg->code; + + while (code && code->op != FETCH_OP_END) { + if (code->op == FETCH_NOP_SYMBOL) + kfree(code->data); + code++; + } kfree(arg->code); kfree(arg->name); kfree(arg->comm); kfree(arg->fmt); } +int traceprobe_update_arg(struct probe_arg *arg) +{ + struct fetch_insn *code = arg->code; + long offset; + char *tmp; + char c; + int ret = 0; + + while (code && code->op != FETCH_OP_END) { + if (code->op == FETCH_NOP_SYMBOL) { + if (code[1].op != FETCH_OP_IMM) + return -EINVAL; + + tmp = strpbrk("+-", code->data); + if (tmp) + c = *tmp; + ret = traceprobe_split_symbol_offset(code->data, + &offset); + if (ret) + return ret; + + code[1].immediate = + (unsigned long)kallsyms_lookup_name(code->data); + if (tmp) + *tmp = c; + if (!code[1].immediate) + return -ENOENT; + code[1].immediate += offset; + } + code++; + } + return 0; +} + /* When len=0, we just calculate the needed length */ #define LEN_OR_ZERO (len ? len - pos : 0) static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 09f62171cc23..974afc1a3e73 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -100,6 +100,7 @@ enum fetch_op { // Stage 5 (loop) op FETCH_OP_LP_ARRAY, /* Array: .param = loop count */ FETCH_OP_END, + FETCH_NOP_SYMBOL, /* Unresolved Symbol holder */ }; struct fetch_insn { @@ -116,6 +117,7 @@ struct fetch_insn { unsigned char rshift; }; unsigned long immediate; + void *data; }; }; @@ -276,7 +278,7 @@ extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, extern int traceprobe_conflict_field_name(const char *name, struct probe_arg *args, int narg); -extern void traceprobe_update_arg(struct probe_arg *arg); +extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); extern int traceprobe_split_symbol_offset(char *symbol, long *offset); -- cgit v1.2.3 From bf173ca92da97863e1579a982d500da98f2e7a3f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 12 Oct 2018 12:50:22 -0400 Subject: tracing: probeevent: Fix uninitialized used of offset in parse args Dan's smatch utility found an uninitialized use of offset in a path in parse_probe_args(). Unless an offset is specifically specified for commands that allow them, it should default to zero. Link: http://lkml.kernel.org/r/20181012134246.5doqaobxunlqqs53@mwanda Fixes: 533059281ee5 ("tracing: probeevent: Introduce new argument fetching code") Reported-by: Dan Carpenter (smatch) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 5b3d573b3dcf..3ef15a6683c0 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -209,7 +209,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, { struct fetch_insn *code = *pcode; unsigned long param; - long offset; + long offset = 0; char *tmp; int ret = 0; -- cgit v1.2.3 From bcfa4b72111c9a4d483024cb1f877803b354aa11 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 15 Aug 2018 14:22:16 -0400 Subject: memremap: Convert to XArray Use the new xa_store_range function instead of the radix tree. Signed-off-by: Matthew Wilcox --- kernel/memremap.c | 76 ++++++++++++------------------------------------------- 1 file changed, 16 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 5b8600d39931..e842fab9f184 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -1,47 +1,21 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright(c) 2015 Intel Corporation. All rights reserved. */ -#include #include -#include -#include #include #include -#include #include +#include +#include #include #include +#include #include +#include -static DEFINE_MUTEX(pgmap_lock); -static RADIX_TREE(pgmap_radix, GFP_KERNEL); +static DEFINE_XARRAY(pgmap_array); #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) #define SECTION_SIZE (1UL << PA_SECTION_SHIFT) -static unsigned long order_at(struct resource *res, unsigned long pgoff) -{ - unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff; - unsigned long nr_pages, mask; - - nr_pages = PHYS_PFN(resource_size(res)); - if (nr_pages == pgoff) - return ULONG_MAX; - - /* - * What is the largest aligned power-of-2 range available from - * this resource pgoff to the end of the resource range, - * considering the alignment of the current pgoff? - */ - mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff); - if (!mask) - return ULONG_MAX; - - return find_first_bit(&mask, BITS_PER_LONG); -} - -#define foreach_order_pgoff(res, order, pgoff) \ - for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \ - pgoff += 1UL << order, order = order_at((res), pgoff)) - #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, unsigned long addr, @@ -70,18 +44,10 @@ vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, EXPORT_SYMBOL(device_private_entry_fault); #endif /* CONFIG_DEVICE_PRIVATE */ -static void pgmap_radix_release(struct resource *res, unsigned long end_pgoff) +static void pgmap_array_delete(struct resource *res) { - unsigned long pgoff, order; - - mutex_lock(&pgmap_lock); - foreach_order_pgoff(res, order, pgoff) { - if (pgoff >= end_pgoff) - break; - radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff); - } - mutex_unlock(&pgmap_lock); - + xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end), + NULL, GFP_KERNEL); synchronize_rcu(); } @@ -142,7 +108,7 @@ static void devm_memremap_pages_release(void *data) mem_hotplug_done(); untrack_pfn(NULL, PHYS_PFN(align_start), align_size); - pgmap_radix_release(res, -1); + pgmap_array_delete(res); dev_WARN_ONCE(dev, pgmap->altmap.alloc, "%s: failed to free all reserved pages\n", __func__); } @@ -175,7 +141,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) struct vmem_altmap *altmap = pgmap->altmap_valid ? &pgmap->altmap : NULL; struct resource *res = &pgmap->res; - unsigned long pfn, pgoff, order; + unsigned long pfn; pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; struct dev_pagemap *conflict_pgmap; @@ -216,20 +182,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) pgmap->dev = dev; - mutex_lock(&pgmap_lock); - error = 0; - - foreach_order_pgoff(res, order, pgoff) { - error = __radix_tree_insert(&pgmap_radix, - PHYS_PFN(res->start) + pgoff, order, pgmap); - if (error) { - dev_err(dev, "%s: failed: %d\n", __func__, error); - break; - } - } - mutex_unlock(&pgmap_lock); + error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start), + PHYS_PFN(res->end), pgmap, GFP_KERNEL)); if (error) - goto err_radix; + goto err_array; nid = dev_to_node(dev); if (nid < 0) @@ -279,8 +235,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) err_kasan: untrack_pfn(NULL, PHYS_PFN(align_start), align_size); err_pfn_remap: - err_radix: - pgmap_radix_release(res, pgoff); + pgmap_array_delete(res); + err_array: return ERR_PTR(error); } EXPORT_SYMBOL(devm_memremap_pages); @@ -320,7 +276,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, /* fall back to slow path lookup */ rcu_read_lock(); - pgmap = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys)); + pgmap = xa_load(&pgmap_array, PHYS_PFN(phys)); if (pgmap && !percpu_ref_tryget_live(pgmap->ref)) pgmap = NULL; rcu_read_unlock(); -- cgit v1.2.3 From d7b31359ecef8d32540266f39d99892f61d17c4b Mon Sep 17 00:00:00 2001 From: Lénaïc Huard Date: Thu, 22 Mar 2018 23:53:05 +0100 Subject: kvm_config: add CONFIG_VIRTIO_MENU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sure that make kvmconfig enables all the virtio drivers even if it is preceded by a make allnoconfig. Signed-off-by: Lénaïc Huard Signed-off-by: Michael S. Tsirkin --- kernel/configs/kvm_guest.config | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config index 108fecc20fc1..208481d91090 100644 --- a/kernel/configs/kvm_guest.config +++ b/kernel/configs/kvm_guest.config @@ -20,6 +20,7 @@ CONFIG_PARAVIRT=y CONFIG_KVM_GUEST=y CONFIG_S390_GUEST=y CONFIG_VIRTIO=y +CONFIG_VIRTIO_MENU=y CONFIG_VIRTIO_PCI=y CONFIG_VIRTIO_BLK=y CONFIG_VIRTIO_CONSOLE=y -- cgit v1.2.3 From 145d952a29320dea883246bcb24ba1da7ac4bb7f Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 4 Oct 2018 14:04:02 +0200 Subject: sched: Factor out nr_iowait and nr_iowait_cpu The function nr_iowait_cpu() can be used directly by nr_iowait() instead of duplicating code. Call nr_iowait_cpu() from nr_iowait() Signed-off-by: Daniel Lezcano Acked-by: Peter Zijlstra (Intel) Signed-off-by: Rafael J. Wysocki --- kernel/sched/core.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fe0223121883..9245c56b8f5f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2875,6 +2875,25 @@ unsigned long long nr_context_switches(void) return sum; } +/* + * Consumers of these two interfaces, like for example the cpuidle menu + * governor, are using nonsensical data. Preferring shallow idle state selection + * for a CPU that has IO-wait which might not even end up running the task when + * it does become runnable. + */ + +unsigned long nr_iowait_cpu(int cpu) +{ + return atomic_read(&cpu_rq(cpu)->nr_iowait); +} + +void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) +{ + struct rq *rq = this_rq(); + *nr_waiters = atomic_read(&rq->nr_iowait); + *load = rq->load.weight; +} + /* * IO-wait accounting, and how its mostly bollocks (on SMP). * @@ -2910,31 +2929,11 @@ unsigned long nr_iowait(void) unsigned long i, sum = 0; for_each_possible_cpu(i) - sum += atomic_read(&cpu_rq(i)->nr_iowait); + sum += nr_iowait_cpu(i); return sum; } -/* - * Consumers of these two interfaces, like for example the cpuidle menu - * governor, are using nonsensical data. Preferring shallow idle state selection - * for a CPU that has IO-wait which might not even end up running the task when - * it does become runnable. - */ - -unsigned long nr_iowait_cpu(int cpu) -{ - struct rq *this = cpu_rq(cpu); - return atomic_read(&this->nr_iowait); -} - -void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) -{ - struct rq *rq = this_rq(); - *nr_waiters = atomic_read(&rq->nr_iowait); - *load = rq->load.weight; -} - #ifdef CONFIG_SMP /* -- cgit v1.2.3 From a7fe5190c03f8137ef08db84a58dd4daf2c4785d Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 4 Oct 2018 14:04:03 +0200 Subject: cpuidle: menu: Remove get_loadavg() from the performance multiplier The function get_loadavg() returns almost always zero. To be more precise, statistically speaking for a total of 1023379 times passing in the function, the load is equal to zero 1020728 times, greater than 100, 610 times, the remaining is between 0 and 5. In 2011, the get_loadavg() was removed from the Android tree because of the above [1]. At this time, the load was: unsigned long this_cpu_load(void) { struct rq *this = this_rq(); return this->cpu_load[0]; } In 2014, the code was changed by commit 372ba8cb46b2 (cpuidle: menu: Lookup CPU runqueues less) and the load is: void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) { struct rq *rq = this_rq(); *nr_waiters = atomic_read(&rq->nr_iowait); *load = rq->load.weight; } with the same result. Both measurements show using the load in this code path does no matter anymore. Removing it. [1] https://android.googlesource.com/kernel/common/+/4dedd9f124703207895777ac6e91dacde0f7cc17 Signed-off-by: Daniel Lezcano Acked-by: Mel Gorman Acked-by: Peter Zijlstra (Intel) Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governors/menu.c | 25 ++++++------------------- include/linux/sched/stat.h | 1 - kernel/sched/core.c | 7 ------- 3 files changed, 6 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 575a68f31761..76df4f947f07 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -134,11 +134,6 @@ struct menu_device { #define LOAD_INT(x) ((x) >> FSHIFT) #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) -static inline int get_loadavg(unsigned long load) -{ - return LOAD_INT(load) * 10 + LOAD_FRAC(load) / 10; -} - static inline int which_bucket(unsigned int duration, unsigned long nr_iowaiters) { int bucket = 0; @@ -172,18 +167,10 @@ static inline int which_bucket(unsigned int duration, unsigned long nr_iowaiters * to be, the higher this multiplier, and thus the higher * the barrier to go to an expensive C state. */ -static inline int performance_multiplier(unsigned long nr_iowaiters, unsigned long load) +static inline int performance_multiplier(unsigned long nr_iowaiters) { - int mult = 1; - - /* for higher loadavg, we are more reluctant */ - - mult += 2 * get_loadavg(load); - - /* for IO wait tasks (per cpu!) we add 5x each */ - mult += 10 * nr_iowaiters; - - return mult; + /* for IO wait tasks (per cpu!) we add 10x each */ + return 1 + 10 * nr_iowaiters; } static DEFINE_PER_CPU(struct menu_device, menu_devices); @@ -301,7 +288,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, int idx; unsigned int interactivity_req; unsigned int predicted_us; - unsigned long nr_iowaiters, cpu_load; + unsigned long nr_iowaiters; ktime_t delta_next; if (data->needs_update) { @@ -312,7 +299,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, /* determine the expected residency time, round up */ data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length(&delta_next)); - get_iowait_load(&nr_iowaiters, &cpu_load); + nr_iowaiters = nr_iowait_cpu(dev->cpu); data->bucket = which_bucket(data->next_timer_us, nr_iowaiters); if (unlikely(drv->state_count <= 1 || latency_req == 0) || @@ -356,7 +343,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * Use the performance multiplier and the user-configurable * latency_req to determine the maximum exit latency. */ - interactivity_req = predicted_us / performance_multiplier(nr_iowaiters, cpu_load); + interactivity_req = predicted_us / performance_multiplier(nr_iowaiters); if (latency_req > interactivity_req) latency_req = interactivity_req; } diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h index 04f1321d14c4..f30954cc059d 100644 --- a/include/linux/sched/stat.h +++ b/include/linux/sched/stat.h @@ -20,7 +20,6 @@ extern unsigned long nr_running(void); extern bool single_task_running(void); extern unsigned long nr_iowait(void); extern unsigned long nr_iowait_cpu(int cpu); -extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); static inline int sched_info_on(void) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9245c56b8f5f..c8d5c279be14 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2887,13 +2887,6 @@ unsigned long nr_iowait_cpu(int cpu) return atomic_read(&cpu_rq(cpu)->nr_iowait); } -void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) -{ - struct rq *rq = this_rq(); - *nr_waiters = atomic_read(&rq->nr_iowait); - *load = rq->load.weight; -} - /* * IO-wait accounting, and how its mostly bollocks (on SMP). * -- cgit v1.2.3 From a1c6ca3c6de763459a6e93b644ec6518c890ba1c Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 17 Oct 2018 13:23:55 +0200 Subject: kernel: hung_task.c: disable on suspend It is possible to observe hung_task complaints when system goes to suspend-to-idle state: # echo freeze > /sys/power/state PM: Syncing filesystems ... done. Freezing user space processes ... (elapsed 0.001 seconds) done. OOM killer disabled. Freezing remaining freezable tasks ... (elapsed 0.002 seconds) done. sd 0:0:0:0: [sda] Synchronizing SCSI cache INFO: task bash:1569 blocked for more than 120 seconds. Not tainted 4.19.0-rc3_+ #687 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. bash D 0 1569 604 0x00000000 Call Trace: ? __schedule+0x1fe/0x7e0 schedule+0x28/0x80 suspend_devices_and_enter+0x4ac/0x750 pm_suspend+0x2c0/0x310 Register a PM notifier to disable the detector on suspend and re-enable back on wakeup. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Rafael J. Wysocki --- kernel/hung_task.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index b9132d1269ef..cb8e3e8ac7b9 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -242,6 +243,28 @@ void reset_hung_task_detector(void) } EXPORT_SYMBOL_GPL(reset_hung_task_detector); +static bool hung_detector_suspended; + +static int hungtask_pm_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + switch (action) { + case PM_SUSPEND_PREPARE: + case PM_HIBERNATION_PREPARE: + case PM_RESTORE_PREPARE: + hung_detector_suspended = true; + break; + case PM_POST_SUSPEND: + case PM_POST_HIBERNATION: + case PM_POST_RESTORE: + hung_detector_suspended = false; + break; + default: + break; + } + return NOTIFY_OK; +} + /* * kthread which checks for tasks stuck in D state */ @@ -261,7 +284,8 @@ static int watchdog(void *dummy) interval = min_t(unsigned long, interval, timeout); t = hung_timeout_jiffies(hung_last_checked, interval); if (t <= 0) { - if (!atomic_xchg(&reset_hung_task, 0)) + if (!atomic_xchg(&reset_hung_task, 0) && + !hung_detector_suspended) check_hung_uninterruptible_tasks(timeout); hung_last_checked = jiffies; continue; @@ -275,6 +299,10 @@ static int watchdog(void *dummy) static int __init hung_task_init(void) { atomic_notifier_chain_register(&panic_notifier_list, &panic_block); + + /* Disable hung task detector on suspend */ + pm_notifier(hungtask_pm_notify, 0); + watchdog_task = kthread_run(watchdog, NULL, "khungtaskd"); return 0; -- cgit v1.2.3 From f592f804831f1cf9d1f9966f58c80f150e6829b5 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 24 Oct 2018 20:15:17 +0900 Subject: bpf: devmap: fix wrong interface selection in notifier_call The dev_map_notification() removes interface in devmap if unregistering interface's ifindex is same. But only checking ifindex is not enough because other netns can have same ifindex. so that wrong interface selection could occurred. Hence netdev pointer comparison code is added. v2: compare netdev pointer instead of using net_eq() (Daniel Borkmann) v1: Initial patch Fixes: 2ddf71e23cc2 ("net: add notifier hooks for devmap bpf map") Signed-off-by: Taehee Yoo Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/devmap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 141710b82a6c..191b79948424 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -512,8 +512,7 @@ static int dev_map_notification(struct notifier_block *notifier, struct bpf_dtab_netdev *dev, *odev; dev = READ_ONCE(dtab->netdev_map[i]); - if (!dev || - dev->dev->ifindex != netdev->ifindex) + if (!dev || netdev != dev->dev) continue; odev = cmpxchg(&dtab->netdev_map[i], dev, NULL); if (dev == odev) -- cgit v1.2.3 From 4a6998aff82a20a1aece86a186d8e5263f8b2315 Mon Sep 17 00:00:00 2001 From: Martin Lau Date: Wed, 24 Oct 2018 20:42:25 +0000 Subject: bpf, btf: fix a missing check bug in btf_parse Wenwen Wang reported: In btf_parse(), the header of the user-space btf data 'btf_data' is firstly parsed and verified through btf_parse_hdr(). In btf_parse_hdr(), the header is copied from user-space 'btf_data' to kernel-space 'btf->hdr' and then verified. If no error happens during the verification process, the whole data of 'btf_data', including the header, is then copied to 'data' in btf_parse(). It is obvious that the header is copied twice here. More importantly, no check is enforced after the second copy to make sure the headers obtained in these two copies are same. Given that 'btf_data' resides in the user space, a malicious user can race to modify the header between these two copies. By doing so, the user can inject inconsistent data, which can cause undefined behavior of the kernel and introduce potential security risk. This issue is similar to the one fixed in commit 8af03d1ae2e1 ("bpf: btf: Fix a missing check bug"). To fix it, this patch copies the user 'btf_data' *before* parsing / verifying the BTF header. Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") Signed-off-by: Martin KaFai Lau Co-developed-by: Wenwen Wang Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 58 ++++++++++++++++++++++++-------------------------------- 1 file changed, 25 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 378cef70341c..ee4c82667d65 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2067,56 +2067,47 @@ static int btf_check_sec_info(struct btf_verifier_env *env, return 0; } -static int btf_parse_hdr(struct btf_verifier_env *env, void __user *btf_data, - u32 btf_data_size) +static int btf_parse_hdr(struct btf_verifier_env *env) { + u32 hdr_len, hdr_copy, btf_data_size; const struct btf_header *hdr; - u32 hdr_len, hdr_copy; - /* - * Minimal part of the "struct btf_header" that - * contains the hdr_len. - */ - struct btf_min_header { - u16 magic; - u8 version; - u8 flags; - u32 hdr_len; - } __user *min_hdr; struct btf *btf; int err; btf = env->btf; - min_hdr = btf_data; + btf_data_size = btf->data_size; - if (btf_data_size < sizeof(*min_hdr)) { + if (btf_data_size < + offsetof(struct btf_header, hdr_len) + sizeof(hdr->hdr_len)) { btf_verifier_log(env, "hdr_len not found"); return -EINVAL; } - if (get_user(hdr_len, &min_hdr->hdr_len)) - return -EFAULT; - + hdr = btf->data; + hdr_len = hdr->hdr_len; if (btf_data_size < hdr_len) { btf_verifier_log(env, "btf_header not found"); return -EINVAL; } - err = bpf_check_uarg_tail_zero(btf_data, sizeof(btf->hdr), hdr_len); - if (err) { - if (err == -E2BIG) - btf_verifier_log(env, "Unsupported btf_header"); - return err; + /* Ensure the unsupported header fields are zero */ + if (hdr_len > sizeof(btf->hdr)) { + u8 *expected_zero = btf->data + sizeof(btf->hdr); + u8 *end = btf->data + hdr_len; + + for (; expected_zero < end; expected_zero++) { + if (*expected_zero) { + btf_verifier_log(env, "Unsupported btf_header"); + return -E2BIG; + } + } } hdr_copy = min_t(u32, hdr_len, sizeof(btf->hdr)); - if (copy_from_user(&btf->hdr, btf_data, hdr_copy)) - return -EFAULT; + memcpy(&btf->hdr, btf->data, hdr_copy); hdr = &btf->hdr; - if (hdr->hdr_len != hdr_len) - return -EINVAL; - btf_verifier_log_hdr(env, btf_data_size); if (hdr->magic != BTF_MAGIC) { @@ -2186,10 +2177,6 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, } env->btf = btf; - err = btf_parse_hdr(env, btf_data, btf_data_size); - if (err) - goto errout; - data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN); if (!data) { err = -ENOMEM; @@ -2198,13 +2185,18 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, btf->data = data; btf->data_size = btf_data_size; - btf->nohdr_data = btf->data + btf->hdr.hdr_len; if (copy_from_user(data, btf_data, btf_data_size)) { err = -EFAULT; goto errout; } + err = btf_parse_hdr(env); + if (err) + goto errout; + + btf->nohdr_data = btf->data + btf->hdr.hdr_len; + err = btf_parse_str_sec(env); if (err) goto errout; -- cgit v1.2.3 From 5d66fa7d9e9e9399ddfdc530f352dd6f7c724485 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 24 Oct 2018 22:05:45 +0200 Subject: bpf: fix direct packet access for flow dissector progs Commit d58e468b1112 ("flow_dissector: implements flow dissector BPF hook") added direct packet access for skbs in may_access_direct_pkt_data() function where this enables read and write access to the skb->data. This is buggy because without a prologue generator such as bpf_unclone_prologue() we would allow for writing into cloned skbs. Original intention might have been to only allow read access where this is not needed (similar as the flow_dissector_func_proto() indicates which enables only bpf_skb_load_bytes() as well), therefore this patch fixes it to restrict to read-only. Fixes: d58e468b1112 ("flow_dissector: implements flow dissector BPF hook") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: Petar Penkov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 98fa0be35370..b0cc8f2ff95f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1387,21 +1387,23 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, enum bpf_access_type t) { switch (env->prog->type) { + /* Program types only with direct read access go here! */ case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_SEG6LOCAL: case BPF_PROG_TYPE_SK_REUSEPORT: - /* dst_input() and dst_output() can't write for now */ + case BPF_PROG_TYPE_FLOW_DISSECTOR: if (t == BPF_WRITE) return false; /* fallthrough */ + + /* Program types with direct read + write access go here! */ case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: - case BPF_PROG_TYPE_FLOW_DISSECTOR: if (meta) return meta->pkt_access; -- cgit v1.2.3 From d5563d367c2ce48ea3d675c77f7109f37311943d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 24 Oct 2018 22:05:46 +0200 Subject: bpf: fix cg_skb types to hint access type in may_access_direct_pkt_data Commit b39b5f411dcf ("bpf: add cg_skb_is_valid_access for BPF_PROG_TYPE_CGROUP_SKB") added direct packet access for skbs in cg_skb program types, however allowed access type was not added to the may_access_direct_pkt_data() helper. Therefore the latter always returns false. This is not directly an issue, it just means writes are unconditionally disabled (which is correct) but also reads. Latter is relevant in this function when BPF helpers may read direct packet data which is unconditionally disabled then. Fix it by properly adding BPF_PROG_TYPE_CGROUP_SKB to may_access_direct_pkt_data(). Fixes: b39b5f411dcf ("bpf: add cg_skb_is_valid_access for BPF_PROG_TYPE_CGROUP_SKB") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b0cc8f2ff95f..5fc9a658af0e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1393,6 +1393,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_LWT_SEG6LOCAL: case BPF_PROG_TYPE_SK_REUSEPORT: case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_CGROUP_SKB: if (t == BPF_WRITE) return false; /* fallthrough */ -- cgit v1.2.3 From 80b0d86a176cab6201719b8dfd806902b0c6e046 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 24 Oct 2018 22:05:47 +0200 Subject: bpf: fix direct packet write into pop/peek helpers Commit f1a2e44a3aec ("bpf: add queue and stack maps") probably just copy-pasted .pkt_access for bpf_map_{pop,peek}_elem() helpers, but this is buggy in this context since it would allow writes into cloned skbs which is invalid. Therefore, disable .pkt_access for the two. Fixes: f1a2e44a3aec ("bpf: add queue and stack maps") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: Mauricio Vasquez B Acked-by: Mauricio Vasquez B Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index ab0d5e3f9892..a74972b07e74 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -99,7 +99,6 @@ BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value) const struct bpf_func_proto bpf_map_pop_elem_proto = { .func = bpf_map_pop_elem, .gpl_only = false, - .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, @@ -113,7 +112,6 @@ BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) const struct bpf_func_proto bpf_map_peek_elem_proto = { .func = bpf_map_pop_elem, .gpl_only = false, - .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, -- cgit v1.2.3 From d3f66e4116aff8dd0d5bd4067295b9ddb5e2c29c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 24 Oct 2018 22:05:48 +0200 Subject: bpf: fix leaking uninitialized memory on pop/peek helpers Commit f1a2e44a3aec ("bpf: add queue and stack maps") added helpers with ARG_PTR_TO_UNINIT_MAP_VALUE. Meaning, the helper is supposed to fill the map value buffer with data instead of reading from it like in other helpers such as map update. However, given the buffer is allowed to be uninitialized (since we fill it in the helper anyway), it also means that the helper is obliged to wipe the memory in case of an error in order to not allow for leaking uninitialized memory. Given pop/peek is both handled inside __{stack,queue}_map_get(), lets wipe it there on error case, that is, empty stack/queue. Fixes: f1a2e44a3aec ("bpf: add queue and stack maps") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: Mauricio Vasquez B Acked-by: Mauricio Vasquez B Signed-off-by: Alexei Starovoitov --- kernel/bpf/queue_stack_maps.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 12a93fb37449..8bbd72d3a121 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -122,6 +122,7 @@ static int __queue_map_get(struct bpf_map *map, void *value, bool delete) raw_spin_lock_irqsave(&qs->lock, flags); if (queue_stack_map_is_empty(qs)) { + memset(value, 0, qs->map.value_size); err = -ENOENT; goto out; } @@ -151,6 +152,7 @@ static int __stack_map_get(struct bpf_map *map, void *value, bool delete) raw_spin_lock_irqsave(&qs->lock, flags); if (queue_stack_map_is_empty(qs)) { + memset(value, 0, qs->map.value_size); err = -ENOENT; goto out; } -- cgit v1.2.3 From b09928b976280d64060d7bee146d7df5c5a29bef Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 24 Oct 2018 22:05:49 +0200 Subject: bpf: make direct packet write unclone more robust Given this seems to be quite fragile and can easily slip through the cracks, lets make direct packet write more robust by requiring that future program types which allow for such write must provide a prologue callback. In case of XDP and sk_msg it's noop, thus add a generic noop handler there. The latter starts out with NULL data/data_end unconditionally when sg pages are shared. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 +++++- net/core/filter.c | 11 +++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5fc9a658af0e..171a2c88e77d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5709,7 +5709,11 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) bool is_narrower_load; u32 target_size; - if (ops->gen_prologue) { + if (ops->gen_prologue || env->seen_direct_write) { + if (!ops->gen_prologue) { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, env->prog); if (cnt >= ARRAY_SIZE(insn_buf)) { diff --git a/net/core/filter.c b/net/core/filter.c index 3fdddfa9a0fd..cd648d09a8e5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5644,6 +5644,15 @@ static bool sock_filter_is_valid_access(int off, int size, prog->expected_attach_type); } +static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + /* Neither direct read nor direct write requires any preliminary + * action. + */ + return 0; +} + static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog, int drop_verdict) { @@ -7210,6 +7219,7 @@ const struct bpf_verifier_ops xdp_verifier_ops = { .get_func_proto = xdp_func_proto, .is_valid_access = xdp_is_valid_access, .convert_ctx_access = xdp_convert_ctx_access, + .gen_prologue = bpf_noop_prologue, }; const struct bpf_prog_ops xdp_prog_ops = { @@ -7308,6 +7318,7 @@ const struct bpf_verifier_ops sk_msg_verifier_ops = { .get_func_proto = sk_msg_func_proto, .is_valid_access = sk_msg_is_valid_access, .convert_ctx_access = sk_msg_convert_ctx_access, + .gen_prologue = bpf_noop_prologue, }; const struct bpf_prog_ops sk_msg_prog_ops = { -- cgit v1.2.3 From ede95a63b5e84ddeea6b0c473b36ab8bfd8c6ce3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 23 Oct 2018 01:11:04 +0200 Subject: bpf: add bpf_jit_limit knob to restrict unpriv allocations Rick reported that the BPF JIT could potentially fill the entire module space with BPF programs from unprivileged users which would prevent later attempts to load normal kernel modules or privileged BPF programs, for example. If JIT was enabled but unsuccessful to generate the image, then before commit 290af86629b2 ("bpf: introduce BPF_JIT_ALWAYS_ON config") we would always fall back to the BPF interpreter. Nowadays in the case where the CONFIG_BPF_JIT_ALWAYS_ON could be set, then the load will abort with a failure since the BPF interpreter was compiled out. Add a global limit and enforce it for unprivileged users such that in case of BPF interpreter compiled out we fail once the limit has been reached or we fall back to BPF interpreter earlier w/o using module mem if latter was compiled in. In a next step, fair share among unprivileged users can be resolved in particular for the case where we would fail hard once limit is reached. Fixes: 290af86629b2 ("bpf: introduce BPF_JIT_ALWAYS_ON config") Fixes: 0a14842f5a3c ("net: filter: Just In Time compiler for x86-64") Co-Developed-by: Rick Edgecombe Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: Eric Dumazet Cc: Jann Horn Cc: Kees Cook Cc: LKML Signed-off-by: Alexei Starovoitov --- Documentation/sysctl/net.txt | 8 ++++++++ include/linux/filter.h | 1 + kernel/bpf/core.c | 49 +++++++++++++++++++++++++++++++++++++++++--- net/core/sysctl_net_core.c | 10 +++++++-- 4 files changed, 63 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index 9ecde517728c..2793d4eac55f 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -92,6 +92,14 @@ Values : 0 - disable JIT kallsyms export (default value) 1 - enable JIT kallsyms export for privileged users only +bpf_jit_limit +------------- + +This enforces a global limit for memory allocations to the BPF JIT +compiler in order to reject unprivileged JIT requests once it has +been surpassed. bpf_jit_limit contains the value of the global limit +in bytes. + dev_weight -------------- diff --git a/include/linux/filter.h b/include/linux/filter.h index 91b4c934f02e..de629b706d1d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -854,6 +854,7 @@ bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, extern int bpf_jit_enable; extern int bpf_jit_harden; extern int bpf_jit_kallsyms; +extern int bpf_jit_limit; typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7c7eeea8cffc..6377225b2082 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -365,10 +365,13 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) } #ifdef CONFIG_BPF_JIT +# define BPF_JIT_LIMIT_DEFAULT (PAGE_SIZE * 40000) + /* All BPF JIT sysctl knobs here. */ int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); int bpf_jit_harden __read_mostly; int bpf_jit_kallsyms __read_mostly; +int bpf_jit_limit __read_mostly = BPF_JIT_LIMIT_DEFAULT; static __always_inline void bpf_get_prog_addr_region(const struct bpf_prog *prog, @@ -577,27 +580,64 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, return ret; } +static atomic_long_t bpf_jit_current; + +#if defined(MODULES_VADDR) +static int __init bpf_jit_charge_init(void) +{ + /* Only used as heuristic here to derive limit. */ + bpf_jit_limit = min_t(u64, round_up((MODULES_END - MODULES_VADDR) >> 2, + PAGE_SIZE), INT_MAX); + return 0; +} +pure_initcall(bpf_jit_charge_init); +#endif + +static int bpf_jit_charge_modmem(u32 pages) +{ + if (atomic_long_add_return(pages, &bpf_jit_current) > + (bpf_jit_limit >> PAGE_SHIFT)) { + if (!capable(CAP_SYS_ADMIN)) { + atomic_long_sub(pages, &bpf_jit_current); + return -EPERM; + } + } + + return 0; +} + +static void bpf_jit_uncharge_modmem(u32 pages) +{ + atomic_long_sub(pages, &bpf_jit_current); +} + struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_binary_header *hdr; - unsigned int size, hole, start; + u32 size, hole, start, pages; /* Most of BPF filters are really small, but if some of them * fill a page, allow at least 128 extra bytes to insert a * random section of illegal instructions. */ size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); + pages = size / PAGE_SIZE; + + if (bpf_jit_charge_modmem(pages)) + return NULL; hdr = module_alloc(size); - if (hdr == NULL) + if (!hdr) { + bpf_jit_uncharge_modmem(pages); return NULL; + } /* Fill space with illegal/arch-dep instructions. */ bpf_fill_ill_insns(hdr, size); - hdr->pages = size / PAGE_SIZE; + hdr->pages = pages; hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); start = (get_random_int() % hole) & ~(alignment - 1); @@ -610,7 +650,10 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, void bpf_jit_binary_free(struct bpf_binary_header *hdr) { + u32 pages = hdr->pages; + module_memfree(hdr); + bpf_jit_uncharge_modmem(pages); } /* This symbol is only overridden by archs that have different diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index b1a2c5e38530..37b4667128a3 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -279,7 +279,6 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, return ret; } -# ifdef CONFIG_HAVE_EBPF_JIT static int proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -290,7 +289,6 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, return proc_dointvec_minmax(table, write, buffer, lenp, ppos); } -# endif #endif static struct ctl_table net_core_table[] = { @@ -397,6 +395,14 @@ static struct ctl_table net_core_table[] = { .extra2 = &one, }, # endif + { + .procname = "bpf_jit_limit", + .data = &bpf_jit_limit, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax_bpf_restricted, + .extra1 = &one, + }, #endif { .procname = "netdev_tstamp_prequeue", -- cgit v1.2.3 From da387e5c930f43d9f3b011a6fbb33bdf43d9714c Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 17 Oct 2018 09:51:43 +0300 Subject: tracing: Export trace_dump_stack to modules There is no reason for this function to be unexprted and it's a useful debugging aid. Link: http://lkml.kernel.org/r/1539759103-5923-1-git-send-email-nborisov@suse.com Signed-off-by: Nikolay Borisov Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ec5b21778806..ff1c4b20cd0a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2727,6 +2727,7 @@ void trace_dump_stack(int skip) __ftrace_trace_stack(global_trace.trace_buffer.buffer, flags, skip, preempt_count(), NULL); } +EXPORT_SYMBOL_GPL(trace_dump_stack); static DEFINE_PER_CPU(int, user_stack_count); -- cgit v1.2.3 From a2acce536921bd793bae13fa344fcea157638e72 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 15 Oct 2018 23:14:28 -0400 Subject: tracing: Have stack tracer trace full stack The stack tracer traces every function call checking the current stack (in non interrupt context), looking for the deepest stack, and saving it when it finds a new max depth. The problem is that it calls save_stack_trace(), and with the new ORC unwinder, it can skip too much. As it looks at the ip of the function call in the backtrace to find where it should start, it doesn't need to skip anything. The stack trace selftest would fail when the kernel was complied with the ORC UNDWINDER enabled. Without skipping functions when doing the stack trace, it now passes again. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_stack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 4237eba4ef20..2b0d1ee3241c 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -111,7 +111,7 @@ check_stack(unsigned long ip, unsigned long *stack) stack_trace_max_size = this_size; stack_trace_max.nr_entries = 0; - stack_trace_max.skip = 3; + stack_trace_max.skip = 0; save_stack_trace(&stack_trace_max); -- cgit v1.2.3 From 18858511fd8a877303cc34c06efa461b26a0e070 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 22 Oct 2018 00:08:20 +0900 Subject: tracing: Return -ENOENT if there is no target synthetic event Return -ENOENT error if there is no target synthetic event. This notices an operation failure to user as below; # echo 'wakeup_latency u64 lat; pid_t pid;' > synthetic_events # echo '!wakeup' >> synthetic_events sh: write error: No such file or directory Link: http://lkml.kernel.org/r/154013449986.25576.9487131386597290172.stgit@devbox Acked-by: Tom Zanussi Tested-by: Tom Zanussi Cc: Shuah Khan Cc: Rajvi Jingar Cc: stable@vger.kernel.org Fixes: 4b147936fa50 ('tracing: Add support for 'synthetic' events') Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index d239004aaf29..eb908ef2ecec 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1063,8 +1063,10 @@ static int create_synth_event(int argc, char **argv) event = NULL; ret = -EEXIST; goto out; - } else if (delete_event) + } else if (delete_event) { + ret = -ENOENT; goto out; + } if (argc < 2) { ret = -EINVAL; -- cgit v1.2.3 From 28fa741c27e6d57f6bf594ba3c444ce79e671e09 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 29 Oct 2018 23:32:11 +0000 Subject: perf/core: Clean up inconsisent indentation Replace a bunch of spaces with tab, cleans up indentation Signed-off-by: Colin Ian King Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-janitors@vger.kernel.org Link: http://lkml.kernel.org/r/20181029233211.21475-1-colin.king@canonical.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 5a97f34bc14c..65e90c752a91 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -750,7 +750,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) /* * Do not update time when cgroup is not active */ - if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) + if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) __update_cgrp_time(event->cgrp); } -- cgit v1.2.3 From ea6f650465c61c70b4c96648ed2cb8a0c55db337 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Tue, 30 Oct 2018 15:04:51 -0700 Subject: kernel/fail_function.c: remove meaningless null pointer check before debugfs_remove_recursive debugfs_remove_recursive() has taken the null pointer into account. just remove the null check before debugfs_remove_recursive(). Link: http://lkml.kernel.org/r/1537494404-16473-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Acked-by: Masami Hiramatsu Acked-by: Kees Cook Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fail_function.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fail_function.c b/kernel/fail_function.c index bc80a4e268c0..17f75b545f66 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -173,8 +173,7 @@ static void fei_debugfs_remove_attr(struct fei_attr *attr) struct dentry *dir; dir = debugfs_lookup(attr->kp.symbol_name, fei_debugfs_dir); - if (dir) - debugfs_remove_recursive(dir); + debugfs_remove_recursive(dir); } static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs) -- cgit v1.2.3 From 2e58f57d859fbe0ce79e052d7a63bba32d5786e8 Mon Sep 17 00:00:00 2001 From: Weikang Shi Date: Tue, 30 Oct 2018 15:07:05 -0700 Subject: kernel/signal.c: fix a comment error Because get_signal_to_deliver() was renamed to get_signal() the comment should be fixed. Link: http://lkml.kernel.org/r/1539179128-45709-1-git-send-email-swkhack@gmail.com Signed-off-by: Weikang Shi Reported-by: Christian Brauner Cc: Eric W. Biederman Cc: Oleg Nesterov Cc: Anna-Maria Gleixner Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/signal.c | 4 ++-- kernel/signal.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index 9a960829a01d..99099f73b207 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -344,10 +344,10 @@ ia64_do_signal (struct sigscratch *scr, long in_syscall) get_signal(&ksig); /* - * get_signal_to_deliver() may have run a debugger (via notify_parent()) + * get_signal() may have run a debugger (via notify_parent()) * and the debugger may have modified the state (e.g., to arrange for an * inferior call), thus it's important to check for restarting _after_ - * get_signal_to_deliver(). + * get_signal(). */ if ((long) scr->pt.r10 != -1) /* diff --git a/kernel/signal.c b/kernel/signal.c index 17565240b1c6..9a32bc2088c9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -892,7 +892,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) /* * The first thread which returns from do_signal_stop() * will take ->siglock, notice SIGNAL_CLD_MASK, and - * notify its parent. See get_signal_to_deliver(). + * notify its parent. See get_signal(). */ signal_set_stop_flags(signal, why | SIGNAL_STOP_CONTINUED); signal->group_stop_count = 0; -- cgit v1.2.3 From 95c4fb78fb23081472465ca20d5d31c4b780ed82 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 30 Oct 2018 15:07:13 -0700 Subject: kernel/panic.c: do not append newline to the stack protector panic string ... because panic() itself already does this. Otherwise you have line-broken trailer: [ 1.836965] ---[ end Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: pgd_alloc+0x29e/0x2a0 [ 1.836965] ]--- Link: http://lkml.kernel.org/r/20181008202901.7894-1-bp@alien8.de Signed-off-by: Borislav Petkov Acked-by: Kees Cook Cc: Masahiro Yamada Cc: "Steven Rostedt (VMware)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 8b2e002d52eb..837a94b7024d 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -631,7 +631,7 @@ device_initcall(register_warn_debugfs); */ __visible void __stack_chk_fail(void) { - panic("stack-protector: Kernel stack is corrupted in: %pB\n", + panic("stack-protector: Kernel stack is corrupted in: %pB", __builtin_return_address(0)); } EXPORT_SYMBOL(__stack_chk_fail); -- cgit v1.2.3 From b49dec1cf8ff1e0b204dd2c30b95a92d75591146 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 30 Oct 2018 15:07:17 -0700 Subject: kernel/panic.c: filter out a potential trailing newline If a call to panic() terminates the string with a \n , the result puts the closing brace ']---' on a newline because panic() itself adds \n too. Now, if one goes and removes the newline chars from all panic() invocations - and the stats right now look like this: ~300 calls with a \n ~500 calls without a \n one is destined to a neverending game of whack-a-mole because the usual thing to do is add a newline at the end of a string a function is supposed to print. Therefore, simply zap any \n at the end of the panic string to avoid touching so many places in the kernel. Link: http://lkml.kernel.org/r/20181009205019.2786-1-bp@alien8.de Signed-off-by: Borislav Petkov Acked-by: Kees Cook Reviewed-by: Steven Rostedt (VMware) Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 837a94b7024d..f6d549a29a5c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -136,7 +136,7 @@ void panic(const char *fmt, ...) { static char buf[1024]; va_list args; - long i, i_next = 0; + long i, i_next = 0, len; int state = 0; int old_cpu, this_cpu; bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; @@ -173,8 +173,12 @@ void panic(const char *fmt, ...) console_verbose(); bust_spinlocks(1); va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); + len = vscnprintf(buf, sizeof(buf), fmt, args); va_end(args); + + if (len && buf[len - 1] == '\n') + buf[len - 1] = '\0'; + pr_emerg("Kernel panic - not syncing: %s\n", buf); #ifdef CONFIG_DEBUG_BUGVERBOSE /* -- cgit v1.2.3 From 6a32c2469c3fbfee8f25bcd20af647326650a6cf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 30 Oct 2018 15:07:32 -0700 Subject: kbuild: fix kernel/bounds.c 'W=1' warning Building any configuration with 'make W=1' produces a warning: kernel/bounds.c:16:6: warning: no previous prototype for 'foo' [-Wmissing-prototypes] When also passing -Werror, this prevents us from building any other files. Nobody ever calls the function, but we can't make it 'static' either since we want the compiler output. Calling it 'main' instead however avoids the warning, because gcc does not insist on having a declaration for main. Link: http://lkml.kernel.org/r/20181005083313.2088252-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Reported-by: Kieran Bingham Reviewed-by: Kieran Bingham Cc: David Laight Cc: Masahiro Yamada Cc: Greg Kroah-Hartman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/bounds.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bounds.c b/kernel/bounds.c index c373e887c066..9795d75b09b2 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -13,7 +13,7 @@ #include #include -void foo(void) +int main(void) { /* The enum constants to put into include/generated/bounds.h */ DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); @@ -23,4 +23,6 @@ void foo(void) #endif DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); /* End of constants */ + + return 0; } -- cgit v1.2.3 From eb31d559f1e8390195372cd51cfb198da8bc84b9 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 30 Oct 2018 15:08:04 -0700 Subject: memblock: remove _virt from APIs returning virtual address The conversion is done using sed -i 's@memblock_virt_alloc@memblock_alloc@g' \ $(git grep -l memblock_virt_alloc) Link: http://lkml.kernel.org/r/1536927045-23536-8-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Cc: Catalin Marinas Cc: Chris Zankel Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Kroah-Hartman Cc: Guan Xuetao Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Jonas Bonn Cc: Jonathan Corbet Cc: Ley Foon Tan Cc: Mark Salter Cc: Martin Schwidefsky Cc: Matt Turner Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Palmer Dabbelt Cc: Paul Burton Cc: Richard Kuo Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Serge Semin Cc: Thomas Gleixner Cc: Tony Luck Cc: Vineet Gupta Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/setup.c | 4 ++-- arch/arm/mach-omap2/omap_hwmod.c | 6 ++--- arch/arm64/mm/kasan_init.c | 2 +- arch/arm64/mm/numa.c | 2 +- arch/mips/kernel/setup.c | 2 +- arch/powerpc/kernel/pci_32.c | 2 +- arch/powerpc/lib/alloc.c | 2 +- arch/powerpc/mm/mmu_context_nohash.c | 6 ++--- arch/powerpc/platforms/powermac/nvram.c | 2 +- arch/powerpc/platforms/powernv/pci-ioda.c | 6 ++--- arch/powerpc/platforms/ps3/setup.c | 2 +- arch/powerpc/sysdev/msi_bitmap.c | 2 +- arch/s390/kernel/setup.c | 8 +++---- arch/s390/kernel/smp.c | 2 +- arch/s390/kernel/topology.c | 4 ++-- arch/s390/numa/mode_emu.c | 2 +- arch/s390/numa/toptree.c | 2 +- arch/x86/mm/kasan_init_64.c | 4 ++-- arch/xtensa/mm/kasan_init.c | 2 +- drivers/clk/ti/clk.c | 2 +- drivers/firmware/memmap.c | 2 +- drivers/of/fdt.c | 2 +- drivers/of/unittest.c | 2 +- include/linux/bootmem.h | 38 +++++++++++++++---------------- init/main.c | 6 ++--- kernel/dma/swiotlb.c | 6 ++--- kernel/power/snapshot.c | 2 +- kernel/printk/printk.c | 4 ++-- lib/cpumask.c | 2 +- mm/hugetlb.c | 2 +- mm/kasan/kasan_init.c | 2 +- mm/memblock.c | 26 ++++++++++----------- mm/page_alloc.c | 8 +++---- mm/page_ext.c | 2 +- mm/percpu.c | 28 +++++++++++------------ mm/sparse-vmemmap.c | 2 +- mm/sparse.c | 12 +++++----- 37 files changed, 105 insertions(+), 105 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 4c249cb261f3..39e6090d23ac 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -857,7 +857,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc) */ boot_alias_start = phys_to_idmap(start); if (arm_has_idmap_alias() && boot_alias_start != IDMAP_INVALID_ADDR) { - res = memblock_virt_alloc(sizeof(*res), 0); + res = memblock_alloc(sizeof(*res), 0); res->name = "System RAM (boot alias)"; res->start = boot_alias_start; res->end = phys_to_idmap(end); @@ -865,7 +865,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc) request_resource(&iomem_resource, res); } - res = memblock_virt_alloc(sizeof(*res), 0); + res = memblock_alloc(sizeof(*res), 0); res->name = "System RAM"; res->start = start; res->end = end; diff --git a/arch/arm/mach-omap2/omap_hwmod.c b/arch/arm/mach-omap2/omap_hwmod.c index 56a1fe90d394..1f9b34a7eccd 100644 --- a/arch/arm/mach-omap2/omap_hwmod.c +++ b/arch/arm/mach-omap2/omap_hwmod.c @@ -726,7 +726,7 @@ static int __init _setup_clkctrl_provider(struct device_node *np) u64 size; int i; - provider = memblock_virt_alloc(sizeof(*provider), 0); + provider = memblock_alloc(sizeof(*provider), 0); if (!provider) return -ENOMEM; @@ -736,12 +736,12 @@ static int __init _setup_clkctrl_provider(struct device_node *np) of_property_count_elems_of_size(np, "reg", sizeof(u32)) / 2; provider->addr = - memblock_virt_alloc(sizeof(void *) * provider->num_addrs, 0); + memblock_alloc(sizeof(void *) * provider->num_addrs, 0); if (!provider->addr) return -ENOMEM; provider->size = - memblock_virt_alloc(sizeof(u32) * provider->num_addrs, 0); + memblock_alloc(sizeof(u32) * provider->num_addrs, 0); if (!provider->size) return -ENOMEM; diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index fccb1a6f8c6f..6a65a2912d36 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -38,7 +38,7 @@ static pgd_t tmp_pg_dir[PTRS_PER_PGD] __initdata __aligned(PGD_SIZE); static phys_addr_t __init kasan_alloc_zeroed_page(int node) { - void *p = memblock_virt_alloc_try_nid(PAGE_SIZE, PAGE_SIZE, + void *p = memblock_alloc_try_nid(PAGE_SIZE, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, node); return __pa(p); diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c index c7fb34efd23e..0bff116c07a8 100644 --- a/arch/arm64/mm/numa.c +++ b/arch/arm64/mm/numa.c @@ -168,7 +168,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, { int nid = early_cpu_to_node(cpu); - return memblock_virt_alloc_try_nid(size, align, + return memblock_alloc_try_nid(size, align, __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, nid); } diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 01a5ff4c41ff..0c997645e8f0 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -859,7 +859,7 @@ static void __init arch_mem_init(char **cmdline_p) * Prevent memblock from allocating high memory. * This cannot be done before max_low_pfn is detected, so up * to this point is possible to only reserve physical memory - * with memblock_reserve; memblock_virt_alloc* can be used + * with memblock_reserve; memblock_alloc* can be used * only after this point */ memblock_set_current_limit(PFN_PHYS(max_low_pfn)); diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index 4da8ed576229..d39ec3a4550a 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -203,7 +203,7 @@ pci_create_OF_bus_map(void) struct property* of_prop; struct device_node *dn; - of_prop = memblock_virt_alloc(sizeof(struct property) + 256, 0); + of_prop = memblock_alloc(sizeof(struct property) + 256, 0); dn = of_find_node_by_path("/"); if (dn) { memset(of_prop, -1, sizeof(struct property) + 256); diff --git a/arch/powerpc/lib/alloc.c b/arch/powerpc/lib/alloc.c index 06796dec01ea..bf87d6e13369 100644 --- a/arch/powerpc/lib/alloc.c +++ b/arch/powerpc/lib/alloc.c @@ -14,7 +14,7 @@ void * __ref zalloc_maybe_bootmem(size_t size, gfp_t mask) if (slab_is_available()) p = kzalloc(size, mask); else { - p = memblock_virt_alloc(size, 0); + p = memblock_alloc(size, 0); } return p; } diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index 4d80239ef83c..954f1986af4d 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -461,10 +461,10 @@ void __init mmu_context_init(void) /* * Allocate the maps used by context management */ - context_map = memblock_virt_alloc(CTX_MAP_SIZE, 0); - context_mm = memblock_virt_alloc(sizeof(void *) * (LAST_CONTEXT + 1), 0); + context_map = memblock_alloc(CTX_MAP_SIZE, 0); + context_mm = memblock_alloc(sizeof(void *) * (LAST_CONTEXT + 1), 0); #ifdef CONFIG_SMP - stale_map[boot_cpuid] = memblock_virt_alloc(CTX_MAP_SIZE, 0); + stale_map[boot_cpuid] = memblock_alloc(CTX_MAP_SIZE, 0); cpuhp_setup_state_nocalls(CPUHP_POWERPC_MMU_CTX_PREPARE, "powerpc/mmu/ctx:prepare", diff --git a/arch/powerpc/platforms/powermac/nvram.c b/arch/powerpc/platforms/powermac/nvram.c index 60b03a1703d1..f45b369177a4 100644 --- a/arch/powerpc/platforms/powermac/nvram.c +++ b/arch/powerpc/platforms/powermac/nvram.c @@ -513,7 +513,7 @@ static int __init core99_nvram_setup(struct device_node *dp, unsigned long addr) printk(KERN_ERR "nvram: no address\n"); return -EINVAL; } - nvram_image = memblock_virt_alloc(NVRAM_SIZE, 0); + nvram_image = memblock_alloc(NVRAM_SIZE, 0); nvram_data = ioremap(addr, NVRAM_SIZE*2); nvram_naddrs = 1; /* Make sure we get the correct case */ diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index cde710297a4e..23a67b545b70 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -3770,7 +3770,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, phb_id = be64_to_cpup(prop64); pr_debug(" PHB-ID : 0x%016llx\n", phb_id); - phb = memblock_virt_alloc(sizeof(*phb), 0); + phb = memblock_alloc(sizeof(*phb), 0); /* Allocate PCI controller */ phb->hose = hose = pcibios_alloc_controller(np); @@ -3816,7 +3816,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, else phb->diag_data_size = PNV_PCI_DIAG_BUF_SIZE; - phb->diag_data = memblock_virt_alloc(phb->diag_data_size, 0); + phb->diag_data = memblock_alloc(phb->diag_data_size, 0); /* Parse 32-bit and IO ranges (if any) */ pci_process_bridge_OF_ranges(hose, np, !hose->global_number); @@ -3875,7 +3875,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, } pemap_off = size; size += phb->ioda.total_pe_num * sizeof(struct pnv_ioda_pe); - aux = memblock_virt_alloc(size, 0); + aux = memblock_alloc(size, 0); phb->ioda.pe_alloc = aux; phb->ioda.m64_segmap = aux + m64map_off; phb->ioda.m32_segmap = aux + m32map_off; diff --git a/arch/powerpc/platforms/ps3/setup.c b/arch/powerpc/platforms/ps3/setup.c index 77a37520068d..12519857a33c 100644 --- a/arch/powerpc/platforms/ps3/setup.c +++ b/arch/powerpc/platforms/ps3/setup.c @@ -126,7 +126,7 @@ static void __init prealloc(struct ps3_prealloc *p) if (!p->size) return; - p->address = memblock_virt_alloc(p->size, p->align); + p->address = memblock_alloc(p->size, p->align); printk(KERN_INFO "%s: %lu bytes at %p\n", p->name, p->size, p->address); diff --git a/arch/powerpc/sysdev/msi_bitmap.c b/arch/powerpc/sysdev/msi_bitmap.c index e64a411d1a00..349a9ff6ca5b 100644 --- a/arch/powerpc/sysdev/msi_bitmap.c +++ b/arch/powerpc/sysdev/msi_bitmap.c @@ -128,7 +128,7 @@ int __ref msi_bitmap_alloc(struct msi_bitmap *bmp, unsigned int irq_count, if (bmp->bitmap_from_slab) bmp->bitmap = kzalloc(size, GFP_KERNEL); else { - bmp->bitmap = memblock_virt_alloc(size, 0); + bmp->bitmap = memblock_alloc(size, 0); /* the bitmap won't be freed from memblock allocator */ kmemleak_not_leak(bmp->bitmap); } diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 204ccfa54bf3..781c1053a773 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -378,7 +378,7 @@ static void __init setup_lowcore(void) * Setup lowcore for boot cpu */ BUILD_BUG_ON(sizeof(struct lowcore) != LC_PAGES * PAGE_SIZE); - lc = memblock_virt_alloc_low(sizeof(*lc), sizeof(*lc)); + lc = memblock_alloc_low(sizeof(*lc), sizeof(*lc)); lc->restart_psw.mask = PSW_KERNEL_BITS; lc->restart_psw.addr = (unsigned long) restart_int_handler; lc->external_new_psw.mask = PSW_KERNEL_BITS | @@ -422,7 +422,7 @@ static void __init setup_lowcore(void) * Allocate the global restart stack which is the same for * all CPUs in cast *one* of them does a PSW restart. */ - restart_stack = memblock_virt_alloc(THREAD_SIZE, THREAD_SIZE); + restart_stack = memblock_alloc(THREAD_SIZE, THREAD_SIZE); restart_stack += STACK_INIT_OFFSET; /* @@ -488,7 +488,7 @@ static void __init setup_resources(void) bss_resource.end = (unsigned long) __bss_stop - 1; for_each_memblock(memory, reg) { - res = memblock_virt_alloc(sizeof(*res), 8); + res = memblock_alloc(sizeof(*res), 8); res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; res->name = "System RAM"; @@ -502,7 +502,7 @@ static void __init setup_resources(void) std_res->start > res->end) continue; if (std_res->end > res->end) { - sub_res = memblock_virt_alloc(sizeof(*sub_res), 8); + sub_res = memblock_alloc(sizeof(*sub_res), 8); *sub_res = *std_res; sub_res->end = res->end; std_res->start = res->end + 1; diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 1b3188f57b58..44f9a7d6450b 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -761,7 +761,7 @@ void __init smp_detect_cpus(void) u16 address; /* Get CPU information */ - info = memblock_virt_alloc(sizeof(*info), 8); + info = memblock_alloc(sizeof(*info), 8); smp_get_core_info(info, 1); /* Find boot CPU type */ if (sclp.has_core_type) { diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index e8184a15578a..799a91882a76 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -519,7 +519,7 @@ static void __init alloc_masks(struct sysinfo_15_1_x *info, nr_masks *= info->mag[TOPOLOGY_NR_MAG - offset - 1 - i]; nr_masks = max(nr_masks, 1); for (i = 0; i < nr_masks; i++) { - mask->next = memblock_virt_alloc(sizeof(*mask->next), 8); + mask->next = memblock_alloc(sizeof(*mask->next), 8); mask = mask->next; } } @@ -537,7 +537,7 @@ void __init topology_init_early(void) } if (!MACHINE_HAS_TOPOLOGY) goto out; - tl_info = memblock_virt_alloc(PAGE_SIZE, PAGE_SIZE); + tl_info = memblock_alloc(PAGE_SIZE, PAGE_SIZE); info = tl_info; store_topology(info); pr_info("The CPU configuration topology of the machine is: %d %d %d %d %d %d / %d\n", diff --git a/arch/s390/numa/mode_emu.c b/arch/s390/numa/mode_emu.c index 83b222c57609..5a381fc8e958 100644 --- a/arch/s390/numa/mode_emu.c +++ b/arch/s390/numa/mode_emu.c @@ -313,7 +313,7 @@ static void __ref create_core_to_node_map(void) { int i; - emu_cores = memblock_virt_alloc(sizeof(*emu_cores), 8); + emu_cores = memblock_alloc(sizeof(*emu_cores), 8); for (i = 0; i < ARRAY_SIZE(emu_cores->to_node_id); i++) emu_cores->to_node_id[i] = NODE_ID_FREE; } diff --git a/arch/s390/numa/toptree.c b/arch/s390/numa/toptree.c index 21d1e8a1546d..7f61cc3fd4d1 100644 --- a/arch/s390/numa/toptree.c +++ b/arch/s390/numa/toptree.c @@ -34,7 +34,7 @@ struct toptree __ref *toptree_alloc(int level, int id) if (slab_is_available()) res = kzalloc(sizeof(*res), GFP_KERNEL); else - res = memblock_virt_alloc(sizeof(*res), 8); + res = memblock_alloc(sizeof(*res), 8); if (!res) return res; diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index e3e77527f8df..77b857cb036f 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -28,10 +28,10 @@ static p4d_t tmp_p4d_table[MAX_PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); static __init void *early_alloc(size_t size, int nid, bool panic) { if (panic) - return memblock_virt_alloc_try_nid(size, size, + return memblock_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid); else - return memblock_virt_alloc_try_nid_nopanic(size, size, + return memblock_alloc_try_nid_nopanic(size, size, __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid); } diff --git a/arch/xtensa/mm/kasan_init.c b/arch/xtensa/mm/kasan_init.c index 6b532b6bd785..1a30a258ccd0 100644 --- a/arch/xtensa/mm/kasan_init.c +++ b/arch/xtensa/mm/kasan_init.c @@ -43,7 +43,7 @@ static void __init populate(void *start, void *end) unsigned long vaddr = (unsigned long)start; pgd_t *pgd = pgd_offset_k(vaddr); pmd_t *pmd = pmd_offset(pgd, vaddr); - pte_t *pte = memblock_virt_alloc(n_pages * sizeof(pte_t), PAGE_SIZE); + pte_t *pte = memblock_alloc(n_pages * sizeof(pte_t), PAGE_SIZE); pr_debug("%s: %p - %p\n", __func__, start, end); diff --git a/drivers/clk/ti/clk.c b/drivers/clk/ti/clk.c index 7d22e1af2247..5c54d3734daf 100644 --- a/drivers/clk/ti/clk.c +++ b/drivers/clk/ti/clk.c @@ -342,7 +342,7 @@ void __init omap2_clk_legacy_provider_init(int index, void __iomem *mem) { struct clk_iomap *io; - io = memblock_virt_alloc(sizeof(*io), 0); + io = memblock_alloc(sizeof(*io), 0); io->mem = mem; diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c index 5de3ed29282c..03cead6d5f97 100644 --- a/drivers/firmware/memmap.c +++ b/drivers/firmware/memmap.c @@ -333,7 +333,7 @@ int __init firmware_map_add_early(u64 start, u64 end, const char *type) { struct firmware_map_entry *entry; - entry = memblock_virt_alloc(sizeof(struct firmware_map_entry), 0); + entry = memblock_alloc(sizeof(struct firmware_map_entry), 0); if (WARN_ON(!entry)) return -ENOMEM; diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 4f915cea6f75..ffe62a7ae19b 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -1179,7 +1179,7 @@ int __init __weak early_init_dt_reserve_memory_arch(phys_addr_t base, static void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align) { - return memblock_virt_alloc(size, align); + return memblock_alloc(size, align); } bool __init early_init_dt_verify(void *params) diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c index a3a6866765f2..01e23b85e798 100644 --- a/drivers/of/unittest.c +++ b/drivers/of/unittest.c @@ -2192,7 +2192,7 @@ static struct device_node *overlay_base_root; static void * __init dt_alloc_memory(u64 size, u64 align) { - return memblock_virt_alloc(size, align); + return memblock_alloc(size, align); } /* diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index b74bafd110b9..7d91f0f5ee44 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -95,78 +95,78 @@ extern void *__alloc_bootmem_low(unsigned long size, #define BOOTMEM_ALLOC_ANYWHERE (~(phys_addr_t)0) /* FIXME: Move to memblock.h at a point where we remove nobootmem.c */ -void *memblock_virt_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align, +void *memblock_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid); -void *memblock_virt_alloc_try_nid_nopanic(phys_addr_t size, +void *memblock_alloc_try_nid_nopanic(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid); -void *memblock_virt_alloc_try_nid(phys_addr_t size, phys_addr_t align, +void *memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid); void __memblock_free_early(phys_addr_t base, phys_addr_t size); void __memblock_free_late(phys_addr_t base, phys_addr_t size); -static inline void * __init memblock_virt_alloc( +static inline void * __init memblock_alloc( phys_addr_t size, phys_addr_t align) { - return memblock_virt_alloc_try_nid(size, align, BOOTMEM_LOW_LIMIT, + return memblock_alloc_try_nid(size, align, BOOTMEM_LOW_LIMIT, BOOTMEM_ALLOC_ACCESSIBLE, NUMA_NO_NODE); } -static inline void * __init memblock_virt_alloc_raw( +static inline void * __init memblock_alloc_raw( phys_addr_t size, phys_addr_t align) { - return memblock_virt_alloc_try_nid_raw(size, align, BOOTMEM_LOW_LIMIT, + return memblock_alloc_try_nid_raw(size, align, BOOTMEM_LOW_LIMIT, BOOTMEM_ALLOC_ACCESSIBLE, NUMA_NO_NODE); } -static inline void * __init memblock_virt_alloc_nopanic( +static inline void * __init memblock_alloc_nopanic( phys_addr_t size, phys_addr_t align) { - return memblock_virt_alloc_try_nid_nopanic(size, align, + return memblock_alloc_try_nid_nopanic(size, align, BOOTMEM_LOW_LIMIT, BOOTMEM_ALLOC_ACCESSIBLE, NUMA_NO_NODE); } -static inline void * __init memblock_virt_alloc_low( +static inline void * __init memblock_alloc_low( phys_addr_t size, phys_addr_t align) { - return memblock_virt_alloc_try_nid(size, align, + return memblock_alloc_try_nid(size, align, BOOTMEM_LOW_LIMIT, ARCH_LOW_ADDRESS_LIMIT, NUMA_NO_NODE); } -static inline void * __init memblock_virt_alloc_low_nopanic( +static inline void * __init memblock_alloc_low_nopanic( phys_addr_t size, phys_addr_t align) { - return memblock_virt_alloc_try_nid_nopanic(size, align, + return memblock_alloc_try_nid_nopanic(size, align, BOOTMEM_LOW_LIMIT, ARCH_LOW_ADDRESS_LIMIT, NUMA_NO_NODE); } -static inline void * __init memblock_virt_alloc_from_nopanic( +static inline void * __init memblock_alloc_from_nopanic( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr) { - return memblock_virt_alloc_try_nid_nopanic(size, align, min_addr, + return memblock_alloc_try_nid_nopanic(size, align, min_addr, BOOTMEM_ALLOC_ACCESSIBLE, NUMA_NO_NODE); } -static inline void * __init memblock_virt_alloc_node( +static inline void * __init memblock_alloc_node( phys_addr_t size, int nid) { - return memblock_virt_alloc_try_nid(size, 0, BOOTMEM_LOW_LIMIT, + return memblock_alloc_try_nid(size, 0, BOOTMEM_LOW_LIMIT, BOOTMEM_ALLOC_ACCESSIBLE, nid); } -static inline void * __init memblock_virt_alloc_node_nopanic( +static inline void * __init memblock_alloc_node_nopanic( phys_addr_t size, int nid) { - return memblock_virt_alloc_try_nid_nopanic(size, 0, BOOTMEM_LOW_LIMIT, + return memblock_alloc_try_nid_nopanic(size, 0, BOOTMEM_LOW_LIMIT, BOOTMEM_ALLOC_ACCESSIBLE, nid); } diff --git a/init/main.c b/init/main.c index 1c3f90264280..86b59cf3bec7 100644 --- a/init/main.c +++ b/init/main.c @@ -375,10 +375,10 @@ static inline void smp_prepare_cpus(unsigned int maxcpus) { } static void __init setup_command_line(char *command_line) { saved_command_line = - memblock_virt_alloc(strlen(boot_command_line) + 1, 0); + memblock_alloc(strlen(boot_command_line) + 1, 0); initcall_command_line = - memblock_virt_alloc(strlen(boot_command_line) + 1, 0); - static_command_line = memblock_virt_alloc(strlen(command_line) + 1, 0); + memblock_alloc(strlen(boot_command_line) + 1, 0); + static_command_line = memblock_alloc(strlen(command_line) + 1, 0); strcpy(saved_command_line, boot_command_line); strcpy(static_command_line, command_line); } diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index ebecaf255ea2..801da67e957b 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -204,10 +204,10 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE * between io_tlb_start and io_tlb_end. */ - io_tlb_list = memblock_virt_alloc( + io_tlb_list = memblock_alloc( PAGE_ALIGN(io_tlb_nslabs * sizeof(int)), PAGE_SIZE); - io_tlb_orig_addr = memblock_virt_alloc( + io_tlb_orig_addr = memblock_alloc( PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)), PAGE_SIZE); for (i = 0; i < io_tlb_nslabs; i++) { @@ -242,7 +242,7 @@ swiotlb_init(int verbose) bytes = io_tlb_nslabs << IO_TLB_SHIFT; /* Get IO TLB memory from the low pages */ - vstart = memblock_virt_alloc_low_nopanic(PAGE_ALIGN(bytes), PAGE_SIZE); + vstart = memblock_alloc_low_nopanic(PAGE_ALIGN(bytes), PAGE_SIZE); if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) return; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 3d37c279c090..34116a6097be 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -963,7 +963,7 @@ void __init __register_nosave_region(unsigned long start_pfn, BUG_ON(!region); } else { /* This allocation cannot fail */ - region = memblock_virt_alloc(sizeof(struct nosave_region), 0); + region = memblock_alloc(sizeof(struct nosave_region), 0); } region->start_pfn = start_pfn; region->end_pfn = end_pfn; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b77150ad1965..429e4a3833ca 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1111,9 +1111,9 @@ void __init setup_log_buf(int early) if (early) { new_log_buf = - memblock_virt_alloc(new_log_buf_len, LOG_ALIGN); + memblock_alloc(new_log_buf_len, LOG_ALIGN); } else { - new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, + new_log_buf = memblock_alloc_nopanic(new_log_buf_len, LOG_ALIGN); } diff --git a/lib/cpumask.c b/lib/cpumask.c index beca6244671a..1405cb22e6bc 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -163,7 +163,7 @@ EXPORT_SYMBOL(zalloc_cpumask_var); */ void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask) { - *mask = memblock_virt_alloc(cpumask_size(), 0); + *mask = memblock_alloc(cpumask_size(), 0); } /** diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7b5c0ad9a6bd..51e9f17dbd5c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2100,7 +2100,7 @@ int __alloc_bootmem_huge_page(struct hstate *h) for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { void *addr; - addr = memblock_virt_alloc_try_nid_raw( + addr = memblock_alloc_try_nid_raw( huge_page_size(h), huge_page_size(h), 0, BOOTMEM_ALLOC_ACCESSIBLE, node); if (addr) { diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c index 7a2a2f13f86f..24d734bdff6b 100644 --- a/mm/kasan/kasan_init.c +++ b/mm/kasan/kasan_init.c @@ -83,7 +83,7 @@ static inline bool kasan_zero_page_entry(pte_t pte) static __init void *early_alloc(size_t size, int node) { - return memblock_virt_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS), + return memblock_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, node); } diff --git a/mm/memblock.c b/mm/memblock.c index 20358374e8a8..58340de3ebc6 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1319,7 +1319,7 @@ phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t ali } /** - * memblock_virt_alloc_internal - allocate boot memory block + * memblock_alloc_internal - allocate boot memory block * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size * @min_addr: the lower bound of the memory region to allocate (phys address) @@ -1345,7 +1345,7 @@ phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t ali * Return: * Virtual address of allocated memory block on success, NULL on failure. */ -static void * __init memblock_virt_alloc_internal( +static void * __init memblock_alloc_internal( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid) @@ -1412,7 +1412,7 @@ done: } /** - * memblock_virt_alloc_try_nid_raw - allocate boot memory block without zeroing + * memblock_alloc_try_nid_raw - allocate boot memory block without zeroing * memory and without panicking * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size @@ -1430,7 +1430,7 @@ done: * Return: * Virtual address of allocated memory block on success, NULL on failure. */ -void * __init memblock_virt_alloc_try_nid_raw( +void * __init memblock_alloc_try_nid_raw( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid) @@ -1441,7 +1441,7 @@ void * __init memblock_virt_alloc_try_nid_raw( __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); - ptr = memblock_virt_alloc_internal(size, align, + ptr = memblock_alloc_internal(size, align, min_addr, max_addr, nid); if (ptr && size > 0) page_init_poison(ptr, size); @@ -1450,7 +1450,7 @@ void * __init memblock_virt_alloc_try_nid_raw( } /** - * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block + * memblock_alloc_try_nid_nopanic - allocate boot memory block * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size * @min_addr: the lower bound of the memory region from where the allocation @@ -1466,7 +1466,7 @@ void * __init memblock_virt_alloc_try_nid_raw( * Return: * Virtual address of allocated memory block on success, NULL on failure. */ -void * __init memblock_virt_alloc_try_nid_nopanic( +void * __init memblock_alloc_try_nid_nopanic( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid) @@ -1477,7 +1477,7 @@ void * __init memblock_virt_alloc_try_nid_nopanic( __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); - ptr = memblock_virt_alloc_internal(size, align, + ptr = memblock_alloc_internal(size, align, min_addr, max_addr, nid); if (ptr) memset(ptr, 0, size); @@ -1485,7 +1485,7 @@ void * __init memblock_virt_alloc_try_nid_nopanic( } /** - * memblock_virt_alloc_try_nid - allocate boot memory block with panicking + * memblock_alloc_try_nid - allocate boot memory block with panicking * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size * @min_addr: the lower bound of the memory region from where the allocation @@ -1495,14 +1495,14 @@ void * __init memblock_virt_alloc_try_nid_nopanic( * allocate only from memory limited by memblock.current_limit value * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * - * Public panicking version of memblock_virt_alloc_try_nid_nopanic() + * Public panicking version of memblock_alloc_try_nid_nopanic() * which provides debug information (including caller info), if enabled, * and panics if the request can not be satisfied. * * Return: * Virtual address of allocated memory block on success, NULL on failure. */ -void * __init memblock_virt_alloc_try_nid( +void * __init memblock_alloc_try_nid( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid) @@ -1512,7 +1512,7 @@ void * __init memblock_virt_alloc_try_nid( memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n", __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); - ptr = memblock_virt_alloc_internal(size, align, + ptr = memblock_alloc_internal(size, align, min_addr, max_addr, nid); if (ptr) { memset(ptr, 0, size); @@ -1529,7 +1529,7 @@ void * __init memblock_virt_alloc_try_nid( * @base: phys starting address of the boot memory block * @size: size of the boot memory block in bytes * - * Free boot memory block previously allocated by memblock_virt_alloc_xx() API. + * Free boot memory block previously allocated by memblock_alloc_xx() API. * The freeing memory will not be released to the buddy allocator. */ void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 59d171f84445..8ca6954fdcdc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6209,7 +6209,7 @@ static void __ref setup_usemap(struct pglist_data *pgdat, zone->pageblock_flags = NULL; if (usemapsize) zone->pageblock_flags = - memblock_virt_alloc_node_nopanic(usemapsize, + memblock_alloc_node_nopanic(usemapsize, pgdat->node_id); } #else @@ -6439,7 +6439,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) end = pgdat_end_pfn(pgdat); end = ALIGN(end, MAX_ORDER_NR_PAGES); size = (end - start) * sizeof(struct page); - map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id); + map = memblock_alloc_node_nopanic(size, pgdat->node_id); pgdat->node_mem_map = map + offset; } pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", @@ -7711,9 +7711,9 @@ void *__init alloc_large_system_hash(const char *tablename, size = bucketsize << log2qty; if (flags & HASH_EARLY) { if (flags & HASH_ZERO) - table = memblock_virt_alloc_nopanic(size, 0); + table = memblock_alloc_nopanic(size, 0); else - table = memblock_virt_alloc_raw(size, 0); + table = memblock_alloc_raw(size, 0); } else if (hashdist) { table = __vmalloc(size, gfp_flags, PAGE_KERNEL); } else { diff --git a/mm/page_ext.c b/mm/page_ext.c index a9826da84ccb..e77c0f031dd0 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -161,7 +161,7 @@ static int __init alloc_node_page_ext(int nid) table_size = get_entry_size() * nr_pages; - base = memblock_virt_alloc_try_nid_nopanic( + base = memblock_alloc_try_nid_nopanic( table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid); if (!base) diff --git a/mm/percpu.c b/mm/percpu.c index 4b90682623e9..3050c1d37d37 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1101,7 +1101,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, region_size = ALIGN(start_offset + map_size, lcm_align); /* allocate chunk */ - chunk = memblock_virt_alloc(sizeof(struct pcpu_chunk) + + chunk = memblock_alloc(sizeof(struct pcpu_chunk) + BITS_TO_LONGS(region_size >> PAGE_SHIFT), 0); @@ -1114,11 +1114,11 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, chunk->nr_pages = region_size >> PAGE_SHIFT; region_bits = pcpu_chunk_map_bits(chunk); - chunk->alloc_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits) * + chunk->alloc_map = memblock_alloc(BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]), 0); - chunk->bound_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits + 1) * + chunk->bound_map = memblock_alloc(BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]), 0); - chunk->md_blocks = memblock_virt_alloc(pcpu_chunk_nr_blocks(chunk) * + chunk->md_blocks = memblock_alloc(pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]), 0); pcpu_init_md_blocks(chunk); @@ -1888,7 +1888,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, __alignof__(ai->groups[0].cpu_map[0])); ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); - ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), PAGE_SIZE); + ptr = memblock_alloc_nopanic(PFN_ALIGN(ai_size), PAGE_SIZE); if (!ptr) return NULL; ai = ptr; @@ -2075,12 +2075,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); /* process group information and build config tables accordingly */ - group_offsets = memblock_virt_alloc(ai->nr_groups * + group_offsets = memblock_alloc(ai->nr_groups * sizeof(group_offsets[0]), 0); - group_sizes = memblock_virt_alloc(ai->nr_groups * + group_sizes = memblock_alloc(ai->nr_groups * sizeof(group_sizes[0]), 0); - unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0); - unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0); + unit_map = memblock_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0); + unit_off = memblock_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0); for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = UINT_MAX; @@ -2144,7 +2144,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, * empty chunks. */ pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; - pcpu_slot = memblock_virt_alloc( + pcpu_slot = memblock_alloc( pcpu_nr_slots * sizeof(pcpu_slot[0]), 0); for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); @@ -2458,7 +2458,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); - areas = memblock_virt_alloc_nopanic(areas_size, 0); + areas = memblock_alloc_nopanic(areas_size, 0); if (!areas) { rc = -ENOMEM; goto out_free; @@ -2599,7 +2599,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, /* unaligned allocations can't be freed, round up to page size */ pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * sizeof(pages[0])); - pages = memblock_virt_alloc(pages_size, 0); + pages = memblock_alloc(pages_size, 0); /* allocate pages */ j = 0; @@ -2688,7 +2688,7 @@ EXPORT_SYMBOL(__per_cpu_offset); static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, size_t align) { - return memblock_virt_alloc_from_nopanic( + return memblock_alloc_from_nopanic( size, align, __pa(MAX_DMA_ADDRESS)); } @@ -2737,7 +2737,7 @@ void __init setup_per_cpu_areas(void) void *fc; ai = pcpu_alloc_alloc_info(1, 1); - fc = memblock_virt_alloc_from_nopanic(unit_size, + fc = memblock_alloc_from_nopanic(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); if (!ai || !fc) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 8301293331a2..91c2c3d25827 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -42,7 +42,7 @@ static void * __ref __earlyonly_bootmem_alloc(int node, unsigned long align, unsigned long goal) { - return memblock_virt_alloc_try_nid_raw(size, align, goal, + return memblock_alloc_try_nid_raw(size, align, goal, BOOTMEM_ALLOC_ACCESSIBLE, node); } diff --git a/mm/sparse.c b/mm/sparse.c index 67ad061f7fb8..cb900dda7fd2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -68,7 +68,7 @@ static noinline struct mem_section __ref *sparse_index_alloc(int nid) if (slab_is_available()) section = kzalloc_node(array_size, GFP_KERNEL, nid); else - section = memblock_virt_alloc_node(array_size, nid); + section = memblock_alloc_node(array_size, nid); return section; } @@ -216,7 +216,7 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) size = sizeof(struct mem_section*) * NR_SECTION_ROOTS; align = 1 << (INTERNODE_CACHE_SHIFT); - mem_section = memblock_virt_alloc(size, align); + mem_section = memblock_alloc(size, align); } #endif @@ -306,7 +306,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, limit = goal + (1UL << PA_SECTION_SHIFT); nid = early_pfn_to_nid(goal >> PAGE_SHIFT); again: - p = memblock_virt_alloc_try_nid_nopanic(size, + p = memblock_alloc_try_nid_nopanic(size, SMP_CACHE_BYTES, goal, limit, nid); if (!p && limit) { @@ -362,7 +362,7 @@ static unsigned long * __init sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, unsigned long size) { - return memblock_virt_alloc_node_nopanic(size, pgdat->node_id); + return memblock_alloc_node_nopanic(size, pgdat->node_id); } static void __init check_usemap_section_nr(int nid, unsigned long *usemap) @@ -391,7 +391,7 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid, if (map) return map; - map = memblock_virt_alloc_try_nid(size, + map = memblock_alloc_try_nid(size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid); return map; @@ -405,7 +405,7 @@ static void __init sparse_buffer_init(unsigned long size, int nid) { WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */ sparsemap_buf = - memblock_virt_alloc_try_nid_raw(size, PAGE_SIZE, + memblock_alloc_try_nid_raw(size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid); sparsemap_buf_end = sparsemap_buf + size; -- cgit v1.2.3 From 57c8a661d95dff48dd9c2f2496139082bbaf241a Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 30 Oct 2018 15:09:49 -0700 Subject: mm: remove include/linux/bootmem.h Move remaining definitions and declarations from include/linux/bootmem.h into include/linux/memblock.h and remove the redundant header. The includes were replaced with the semantic patch below and then semi-automated removal of duplicated '#include @@ @@ - #include + #include [sfr@canb.auug.org.au: dma-direct: fix up for the removal of linux/bootmem.h] Link: http://lkml.kernel.org/r/20181002185342.133d1680@canb.auug.org.au [sfr@canb.auug.org.au: powerpc: fix up for removal of linux/bootmem.h] Link: http://lkml.kernel.org/r/20181005161406.73ef8727@canb.auug.org.au [sfr@canb.auug.org.au: x86/kaslr, ACPI/NUMA: fix for linux/bootmem.h removal] Link: http://lkml.kernel.org/r/20181008190341.5e396491@canb.auug.org.au Link: http://lkml.kernel.org/r/1536927045-23536-30-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Signed-off-by: Stephen Rothwell Acked-by: Michal Hocko Cc: Catalin Marinas Cc: Chris Zankel Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Kroah-Hartman Cc: Guan Xuetao Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Jonas Bonn Cc: Jonathan Corbet Cc: Ley Foon Tan Cc: Mark Salter Cc: Martin Schwidefsky Cc: Matt Turner Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Paul Burton Cc: Richard Kuo Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Serge Semin Cc: Thomas Gleixner Cc: Tony Luck Cc: Vineet Gupta Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/core_cia.c | 2 +- arch/alpha/kernel/core_irongate.c | 1 - arch/alpha/kernel/core_marvel.c | 2 +- arch/alpha/kernel/core_titan.c | 2 +- arch/alpha/kernel/core_tsunami.c | 2 +- arch/alpha/kernel/pci-noop.c | 2 +- arch/alpha/kernel/pci.c | 2 +- arch/alpha/kernel/pci_iommu.c | 2 +- arch/alpha/kernel/setup.c | 1 - arch/alpha/kernel/sys_nautilus.c | 2 +- arch/alpha/mm/init.c | 2 +- arch/alpha/mm/numa.c | 1 - arch/arc/kernel/unwind.c | 2 +- arch/arc/mm/highmem.c | 2 +- arch/arc/mm/init.c | 1 - arch/arm/kernel/devtree.c | 1 - arch/arm/kernel/setup.c | 1 - arch/arm/mach-omap2/omap_hwmod.c | 2 +- arch/arm/mm/dma-mapping.c | 1 - arch/arm/mm/init.c | 1 - arch/arm/xen/mm.c | 1 - arch/arm/xen/p2m.c | 2 +- arch/arm64/kernel/acpi.c | 1 - arch/arm64/kernel/acpi_numa.c | 1 - arch/arm64/kernel/setup.c | 1 - arch/arm64/mm/dma-mapping.c | 2 +- arch/arm64/mm/init.c | 1 - arch/arm64/mm/kasan_init.c | 1 - arch/arm64/mm/numa.c | 1 - arch/c6x/kernel/setup.c | 1 - arch/c6x/mm/init.c | 2 +- arch/h8300/kernel/setup.c | 1 - arch/h8300/mm/init.c | 2 +- arch/hexagon/kernel/dma.c | 2 +- arch/hexagon/kernel/setup.c | 2 +- arch/hexagon/mm/init.c | 1 - arch/ia64/kernel/crash.c | 2 +- arch/ia64/kernel/efi.c | 2 +- arch/ia64/kernel/ia64_ksyms.c | 2 +- arch/ia64/kernel/iosapic.c | 2 +- arch/ia64/kernel/mca.c | 2 +- arch/ia64/kernel/mca_drv.c | 2 +- arch/ia64/kernel/setup.c | 1 - arch/ia64/kernel/smpboot.c | 2 +- arch/ia64/kernel/topology.c | 2 +- arch/ia64/kernel/unwind.c | 2 +- arch/ia64/mm/contig.c | 1 - arch/ia64/mm/discontig.c | 1 - arch/ia64/mm/init.c | 1 - arch/ia64/mm/numa.c | 2 +- arch/ia64/mm/tlb.c | 2 +- arch/ia64/pci/pci.c | 2 +- arch/ia64/sn/kernel/bte.c | 2 +- arch/ia64/sn/kernel/io_common.c | 2 +- arch/ia64/sn/kernel/setup.c | 2 +- arch/m68k/atari/stram.c | 2 +- arch/m68k/coldfire/m54xx.c | 2 +- arch/m68k/kernel/setup_mm.c | 1 - arch/m68k/kernel/setup_no.c | 1 - arch/m68k/kernel/uboot.c | 2 +- arch/m68k/mm/init.c | 2 +- arch/m68k/mm/mcfmmu.c | 1 - arch/m68k/mm/motorola.c | 1 - arch/m68k/mm/sun3mmu.c | 2 +- arch/m68k/sun3/config.c | 2 +- arch/m68k/sun3/dvma.c | 2 +- arch/m68k/sun3/mmu_emu.c | 2 +- arch/m68k/sun3/sun3dvma.c | 2 +- arch/m68k/sun3x/dvma.c | 2 +- arch/microblaze/mm/consistent.c | 2 +- arch/microblaze/mm/init.c | 3 +- arch/microblaze/pci/pci-common.c | 2 +- arch/mips/ar7/memory.c | 2 +- arch/mips/ath79/setup.c | 2 +- arch/mips/bcm63xx/prom.c | 2 +- arch/mips/bcm63xx/setup.c | 2 +- arch/mips/bmips/setup.c | 2 +- arch/mips/cavium-octeon/dma-octeon.c | 2 +- arch/mips/dec/prom/memory.c | 2 +- arch/mips/emma/common/prom.c | 2 +- arch/mips/fw/arc/memory.c | 2 +- arch/mips/jazz/jazzdma.c | 2 +- arch/mips/kernel/crash.c | 2 +- arch/mips/kernel/crash_dump.c | 2 +- arch/mips/kernel/prom.c | 2 +- arch/mips/kernel/setup.c | 1 - arch/mips/kernel/traps.c | 1 - arch/mips/kernel/vpe.c | 2 +- arch/mips/kvm/commpage.c | 2 +- arch/mips/kvm/dyntrans.c | 2 +- arch/mips/kvm/emulate.c | 2 +- arch/mips/kvm/interrupt.c | 2 +- arch/mips/kvm/mips.c | 2 +- arch/mips/lantiq/prom.c | 2 +- arch/mips/lasat/prom.c | 2 +- arch/mips/loongson64/common/init.c | 2 +- arch/mips/loongson64/loongson-3/numa.c | 1 - arch/mips/mm/init.c | 2 +- arch/mips/mm/pgtable-32.c | 2 +- arch/mips/mti-malta/malta-memory.c | 2 +- arch/mips/netlogic/xlp/dt.c | 2 +- arch/mips/pci/pci-legacy.c | 2 +- arch/mips/pci/pci.c | 2 +- arch/mips/ralink/of.c | 2 +- arch/mips/rb532/prom.c | 2 +- arch/mips/sgi-ip27/ip27-memory.c | 1 - arch/mips/sibyte/common/cfe.c | 2 +- arch/mips/sibyte/swarm/setup.c | 2 +- arch/mips/txx9/rbtx4938/prom.c | 2 +- arch/nds32/kernel/setup.c | 3 +- arch/nds32/mm/highmem.c | 2 +- arch/nds32/mm/init.c | 3 +- arch/nios2/kernel/prom.c | 2 +- arch/nios2/kernel/setup.c | 1 - arch/nios2/mm/init.c | 2 +- arch/openrisc/kernel/setup.c | 3 +- arch/openrisc/mm/init.c | 3 +- arch/parisc/mm/init.c | 1 - arch/powerpc/kernel/pci_32.c | 2 +- arch/powerpc/kernel/setup-common.c | 1 - arch/powerpc/kernel/setup_64.c | 3 +- arch/powerpc/lib/alloc.c | 2 +- arch/powerpc/mm/hugetlbpage.c | 1 - arch/powerpc/mm/mem.c | 3 +- arch/powerpc/mm/mmu_context_nohash.c | 2 +- arch/powerpc/mm/numa.c | 3 +- arch/powerpc/platforms/powermac/nvram.c | 2 +- arch/powerpc/platforms/powernv/pci-ioda.c | 3 +- arch/powerpc/platforms/ps3/setup.c | 2 +- arch/powerpc/sysdev/msi_bitmap.c | 2 +- arch/riscv/mm/init.c | 3 +- arch/s390/kernel/crash_dump.c | 3 +- arch/s390/kernel/setup.c | 1 - arch/s390/kernel/smp.c | 3 +- arch/s390/kernel/topology.c | 2 +- arch/s390/kernel/vdso.c | 2 +- arch/s390/mm/extmem.c | 2 +- arch/s390/mm/init.c | 3 +- arch/s390/mm/vmem.c | 3 +- arch/s390/numa/mode_emu.c | 1 - arch/s390/numa/numa.c | 1 - arch/s390/numa/toptree.c | 2 +- arch/sh/mm/init.c | 3 +- arch/sh/mm/ioremap_fixed.c | 2 +- arch/sparc/kernel/mdesc.c | 2 - arch/sparc/kernel/prom_32.c | 2 +- arch/sparc/kernel/setup_64.c | 2 +- arch/sparc/kernel/smp_64.c | 2 +- arch/sparc/mm/init_32.c | 1 - arch/sparc/mm/init_64.c | 3 +- arch/sparc/mm/srmmu.c | 2 +- arch/um/drivers/net_kern.c | 2 +- arch/um/drivers/vector_kern.c | 2 +- arch/um/kernel/initrd.c | 2 +- arch/um/kernel/mem.c | 1 - arch/um/kernel/physmem.c | 1 - arch/unicore32/kernel/hibernate.c | 2 +- arch/unicore32/kernel/setup.c | 3 +- arch/unicore32/mm/init.c | 3 +- arch/unicore32/mm/mmu.c | 1 - arch/x86/kernel/acpi/boot.c | 2 +- arch/x86/kernel/acpi/sleep.c | 1 - arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/apic/io_apic.c | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/e820.c | 3 +- arch/x86/kernel/mpparse.c | 1 - arch/x86/kernel/pci-dma.c | 2 +- arch/x86/kernel/pci-swiotlb.c | 2 +- arch/x86/kernel/pvclock.c | 2 +- arch/x86/kernel/setup.c | 1 - arch/x86/kernel/setup_percpu.c | 1 - arch/x86/kernel/smpboot.c | 2 +- arch/x86/kernel/tce_64.c | 1 - arch/x86/mm/amdtopology.c | 1 - arch/x86/mm/fault.c | 2 +- arch/x86/mm/highmem_32.c | 2 +- arch/x86/mm/init.c | 1 - arch/x86/mm/init_32.c | 1 - arch/x86/mm/init_64.c | 1 - arch/x86/mm/ioremap.c | 2 +- arch/x86/mm/kasan_init_64.c | 3 +- arch/x86/mm/kaslr.c | 1 + arch/x86/mm/numa.c | 1 - arch/x86/mm/numa_32.c | 1 - arch/x86/mm/numa_64.c | 2 +- arch/x86/mm/numa_emulation.c | 1 - arch/x86/mm/pageattr-test.c | 2 +- arch/x86/mm/pageattr.c | 2 +- arch/x86/mm/pat.c | 2 +- arch/x86/mm/physaddr.c | 2 +- arch/x86/pci/i386.c | 2 +- arch/x86/platform/efi/efi.c | 3 +- arch/x86/platform/efi/efi_64.c | 2 +- arch/x86/platform/efi/quirks.c | 1 - arch/x86/platform/olpc/olpc_dt.c | 2 +- arch/x86/power/hibernate_32.c | 2 +- arch/x86/xen/enlighten.c | 2 +- arch/x86/xen/enlighten_pv.c | 3 +- arch/x86/xen/p2m.c | 1 - arch/xtensa/kernel/pci.c | 2 +- arch/xtensa/mm/cache.c | 2 +- arch/xtensa/mm/init.c | 2 +- arch/xtensa/mm/kasan_init.c | 3 +- arch/xtensa/mm/mmu.c | 2 +- arch/xtensa/platforms/iss/network.c | 2 +- arch/xtensa/platforms/iss/setup.c | 2 +- block/blk-settings.c | 2 +- block/bounce.c | 2 +- drivers/acpi/numa.c | 1 - drivers/acpi/tables.c | 3 +- drivers/base/platform.c | 2 +- drivers/clk/ti/clk.c | 2 +- drivers/firmware/dmi_scan.c | 2 +- drivers/firmware/efi/apple-properties.c | 2 +- drivers/firmware/iscsi_ibft_find.c | 2 +- drivers/firmware/memmap.c | 2 +- drivers/iommu/mtk_iommu.c | 2 +- drivers/iommu/mtk_iommu_v1.c | 2 +- drivers/macintosh/smu.c | 3 +- drivers/mtd/ar7part.c | 2 +- drivers/net/arcnet/arc-rimi.c | 2 +- drivers/net/arcnet/com20020-isa.c | 2 +- drivers/net/arcnet/com90io.c | 2 +- drivers/of/fdt.c | 1 - drivers/of/unittest.c | 2 +- drivers/s390/char/fs3270.c | 2 +- drivers/s390/char/tty3270.c | 2 +- drivers/s390/cio/cmf.c | 2 +- drivers/s390/virtio/virtio_ccw.c | 2 +- drivers/sfi/sfi_core.c | 2 +- drivers/tty/serial/cpm_uart/cpm_uart_core.c | 2 +- drivers/tty/serial/cpm_uart/cpm_uart_cpm1.c | 2 +- drivers/tty/serial/cpm_uart/cpm_uart_cpm2.c | 2 +- drivers/usb/early/xhci-dbc.c | 1 - drivers/xen/balloon.c | 2 +- drivers/xen/events/events_base.c | 2 +- drivers/xen/grant-table.c | 2 +- drivers/xen/swiotlb-xen.c | 1 - drivers/xen/xen-selfballoon.c | 2 +- fs/dcache.c | 2 +- fs/inode.c | 2 +- fs/namespace.c | 2 +- fs/proc/kcore.c | 2 +- fs/proc/page.c | 2 +- fs/proc/vmcore.c | 2 +- include/linux/bootmem.h | 173 ---------------------------- include/linux/memblock.h | 151 +++++++++++++++++++++++- init/main.c | 2 +- kernel/dma/direct.c | 2 +- kernel/dma/swiotlb.c | 2 +- kernel/futex.c | 2 +- kernel/locking/qspinlock_paravirt.h | 2 +- kernel/pid.c | 2 +- kernel/power/snapshot.c | 2 +- kernel/printk/printk.c | 1 - kernel/profile.c | 2 +- lib/cpumask.c | 2 +- mm/hugetlb.c | 1 - mm/kasan/kasan_init.c | 3 +- mm/kmemleak.c | 2 +- mm/memblock.c | 1 - mm/memory_hotplug.c | 1 - mm/page_alloc.c | 1 - mm/page_ext.c | 2 +- mm/page_idle.c | 2 +- mm/page_owner.c | 2 +- mm/percpu.c | 2 +- mm/sparse-vmemmap.c | 1 - mm/sparse.c | 1 - net/ipv4/inet_hashtables.c | 2 +- net/ipv4/tcp.c | 2 +- net/ipv4/udp.c | 2 +- net/sctp/protocol.c | 2 +- net/xfrm/xfrm_hash.c | 2 +- 275 files changed, 353 insertions(+), 476 deletions(-) delete mode 100644 include/linux/bootmem.h (limited to 'kernel') diff --git a/arch/alpha/kernel/core_cia.c b/arch/alpha/kernel/core_cia.c index 026ee955fd10..867e8730b0c5 100644 --- a/arch/alpha/kernel/core_cia.c +++ b/arch/alpha/kernel/core_cia.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/alpha/kernel/core_irongate.c b/arch/alpha/kernel/core_irongate.c index 35572be9deb5..a9fd133a7fb2 100644 --- a/arch/alpha/kernel/core_irongate.c +++ b/arch/alpha/kernel/core_irongate.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/core_marvel.c b/arch/alpha/kernel/core_marvel.c index 1f00c9433b10..8a568c4d8e81 100644 --- a/arch/alpha/kernel/core_marvel.c +++ b/arch/alpha/kernel/core_marvel.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/alpha/kernel/core_titan.c b/arch/alpha/kernel/core_titan.c index 132b06bdf903..97551597581b 100644 --- a/arch/alpha/kernel/core_titan.c +++ b/arch/alpha/kernel/core_titan.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/alpha/kernel/core_tsunami.c b/arch/alpha/kernel/core_tsunami.c index e7c956ea46b6..f334b8928d72 100644 --- a/arch/alpha/kernel/core_tsunami.c +++ b/arch/alpha/kernel/core_tsunami.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/alpha/kernel/pci-noop.c b/arch/alpha/kernel/pci-noop.c index 59cbfc2bf2c5..a9378ee0c2f1 100644 --- a/arch/alpha/kernel/pci-noop.c +++ b/arch/alpha/kernel/pci-noop.c @@ -7,7 +7,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/alpha/kernel/pci.c b/arch/alpha/kernel/pci.c index 4cc3eb92f55b..13937e72d875 100644 --- a/arch/alpha/kernel/pci.c +++ b/arch/alpha/kernel/pci.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c index 5d178c7ba5b2..82cf950bda2a 100644 --- a/arch/alpha/kernel/pci_iommu.c +++ b/arch/alpha/kernel/pci_iommu.c @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index 64c06a0adf3d..a37fd990bd55 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_nautilus.c b/arch/alpha/kernel/sys_nautilus.c index ff4f54b86c7f..cd9a112d67ff 100644 --- a/arch/alpha/kernel/sys_nautilus.c +++ b/arch/alpha/kernel/sys_nautilus.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 853d15344934..a42fc5c4db89 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -19,7 +19,7 @@ #include #include #include -#include /* max_low_pfn */ +#include /* max_low_pfn */ #include #include diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c index 26cd925d19b1..74846553e3f1 100644 --- a/arch/alpha/mm/numa.c +++ b/arch/alpha/mm/numa.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arc/kernel/unwind.c b/arch/arc/kernel/unwind.c index 2a01dd1005f4..d34f69eb1a95 100644 --- a/arch/arc/kernel/unwind.c +++ b/arch/arc/kernel/unwind.c @@ -15,7 +15,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/arc/mm/highmem.c b/arch/arc/mm/highmem.c index f582dc8944c9..48e700151810 100644 --- a/arch/arc/mm/highmem.c +++ b/arch/arc/mm/highmem.c @@ -7,7 +7,7 @@ * */ -#include +#include #include #include #include diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index 0f29c6548779..f8fe5668b30f 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -8,7 +8,6 @@ #include #include -#include #include #ifdef CONFIG_BLK_DEV_INITRD #include diff --git a/arch/arm/kernel/devtree.c b/arch/arm/kernel/devtree.c index 13bcd3b867cb..e3057c1b55b9 100644 --- a/arch/arm/kernel/devtree.c +++ b/arch/arm/kernel/devtree.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 39e6090d23ac..840a4adc69fc 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm/mach-omap2/omap_hwmod.c b/arch/arm/mach-omap2/omap_hwmod.c index 1f9b34a7eccd..cd5732ab0cdf 100644 --- a/arch/arm/mach-omap2/omap_hwmod.c +++ b/arch/arm/mach-omap2/omap_hwmod.c @@ -141,7 +141,7 @@ #include #include #include -#include +#include #include diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 66566472c153..661fe48ab78d 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -9,7 +9,6 @@ * * DMA uncached mapping support. */ -#include #include #include #include diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index d421a10c93a8..32e4845af2b6 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c index 785d2a562a23..cb44aa290e73 100644 --- a/arch/arm/xen/mm.c +++ b/arch/arm/xen/mm.c @@ -1,6 +1,5 @@ #include #include -#include #include #include #include diff --git a/arch/arm/xen/p2m.c b/arch/arm/xen/p2m.c index 0641ba54ab62..e70a49fc8dcd 100644 --- a/arch/arm/xen/p2m.c +++ b/arch/arm/xen/p2m.c @@ -1,4 +1,4 @@ -#include +#include #include #include #include diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index ed46dc188b22..44e3c351e1ea 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -16,7 +16,6 @@ #define pr_fmt(fmt) "ACPI: " fmt #include -#include #include #include #include diff --git a/arch/arm64/kernel/acpi_numa.c b/arch/arm64/kernel/acpi_numa.c index 4f4f1815e047..eac1d0cc595c 100644 --- a/arch/arm64/kernel/acpi_numa.c +++ b/arch/arm64/kernel/acpi_numa.c @@ -18,7 +18,6 @@ #include #include -#include #include #include #include diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 3428427f6c93..7ce7306f1d75 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index d190612b8f33..3a703e5d4e32 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index d8d73073835f..9d9582cac6c4 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 6a65a2912d36..63527e585aac 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -11,7 +11,6 @@ */ #define pr_fmt(fmt) "kasan: " fmt -#include #include #include #include diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c index 0bff116c07a8..27a31efd9e8e 100644 --- a/arch/arm64/mm/numa.c +++ b/arch/arm64/mm/numa.c @@ -20,7 +20,6 @@ #define pr_fmt(fmt) "NUMA: " fmt #include -#include #include #include #include diff --git a/arch/c6x/kernel/setup.c b/arch/c6x/kernel/setup.c index 05d96a9541b5..2e1c0ea22eb0 100644 --- a/arch/c6x/kernel/setup.c +++ b/arch/c6x/kernel/setup.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/c6x/mm/init.c b/arch/c6x/mm/init.c index 3383df8b3508..af5ada0520be 100644 --- a/arch/c6x/mm/init.c +++ b/arch/c6x/mm/init.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #ifdef CONFIG_BLK_DEV_RAM #include #endif diff --git a/arch/h8300/kernel/setup.c b/arch/h8300/kernel/setup.c index 34e2df5c0d6d..b32bfa1fe99e 100644 --- a/arch/h8300/kernel/setup.c +++ b/arch/h8300/kernel/setup.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c index f2bf4487aabd..6519252ac4db 100644 --- a/arch/h8300/mm/init.c +++ b/arch/h8300/mm/init.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/hexagon/kernel/dma.c b/arch/hexagon/kernel/dma.c index 706699374444..38eaa7b703e7 100644 --- a/arch/hexagon/kernel/dma.c +++ b/arch/hexagon/kernel/dma.c @@ -19,7 +19,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/hexagon/kernel/setup.c b/arch/hexagon/kernel/setup.c index dc8c7e75b5d1..b3c3e04d4e57 100644 --- a/arch/hexagon/kernel/setup.c +++ b/arch/hexagon/kernel/setup.c @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index 88643faf3981..1719ede9e9bd 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -20,7 +20,6 @@ #include #include -#include #include #include #include diff --git a/arch/ia64/kernel/crash.c b/arch/ia64/kernel/crash.c index 39f4433a6f0e..bec762a9b418 100644 --- a/arch/ia64/kernel/crash.c +++ b/arch/ia64/kernel/crash.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index f77d80edddfe..8f106638913c 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -23,7 +23,7 @@ * Skip non-WB memory and ignore empty memory ranges. */ #include -#include +#include #include #include #include diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c index 6b51c88e3578..b49fe6f618ed 100644 --- a/arch/ia64/kernel/ia64_ksyms.c +++ b/arch/ia64/kernel/ia64_ksyms.c @@ -6,7 +6,7 @@ #ifdef CONFIG_VIRTUAL_MEM_MAP #include #include -#include +#include EXPORT_SYMBOL(min_low_pfn); /* defined by bootmem.c, but not exported by generic code */ EXPORT_SYMBOL(max_low_pfn); /* defined by bootmem.c, but not exported by generic code */ #endif diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c index 550243a94b5d..fe6e4946672e 100644 --- a/arch/ia64/kernel/iosapic.c +++ b/arch/ia64/kernel/iosapic.c @@ -90,7 +90,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 71209766c47f..9a6603f8e409 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -77,7 +77,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/ia64/kernel/mca_drv.c b/arch/ia64/kernel/mca_drv.c index dfe40cbdf3b3..45f956ad715a 100644 --- a/arch/ia64/kernel/mca_drv.c +++ b/arch/ia64/kernel/mca_drv.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 0e6c2d9fb498..583a3746d70b 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -27,7 +27,6 @@ #include #include -#include #include #include #include diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index 74fe317477e6..51ec944b036c 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c index 9b820f7a6a98..e311ee13e61d 100644 --- a/arch/ia64/kernel/topology.c +++ b/arch/ia64/kernel/topology.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/ia64/kernel/unwind.c b/arch/ia64/kernel/unwind.c index e04efa088902..7601fe0622d2 100644 --- a/arch/ia64/kernel/unwind.c +++ b/arch/ia64/kernel/unwind.c @@ -28,7 +28,7 @@ * acquired, then the read-write lock must be acquired first. */ #include -#include +#include #include #include #include diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c index 9e5c23a6b8b4..6e447234205c 100644 --- a/arch/ia64/mm/contig.c +++ b/arch/ia64/mm/contig.c @@ -14,7 +14,6 @@ * Routines used by ia64 machines with contiguous (or virtually contiguous) * memory. */ -#include #include #include #include diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index 70609f823960..8a965784340c 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 43ea4a47163d..d5e12ff1d73c 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -8,7 +8,6 @@ #include #include -#include #include #include #include diff --git a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c index aa19b7ac8222..3861d6e32d5f 100644 --- a/arch/ia64/mm/numa.c +++ b/arch/ia64/mm/numa.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c index 5554863b4c9b..ab545daff7c3 100644 --- a/arch/ia64/mm/tlb.c +++ b/arch/ia64/mm/tlb.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c index 5d71800df431..196a0dd7ff97 100644 --- a/arch/ia64/pci/pci.c +++ b/arch/ia64/pci/pci.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/ia64/sn/kernel/bte.c b/arch/ia64/sn/kernel/bte.c index 9146192b86f5..9900e6d4add6 100644 --- a/arch/ia64/sn/kernel/bte.c +++ b/arch/ia64/sn/kernel/bte.c @@ -16,7 +16,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/ia64/sn/kernel/io_common.c b/arch/ia64/sn/kernel/io_common.c index 8b05d5581615..98f55220c67d 100644 --- a/arch/ia64/sn/kernel/io_common.c +++ b/arch/ia64/sn/kernel/io_common.c @@ -6,7 +6,7 @@ * Copyright (C) 2006 Silicon Graphics, Inc. All rights reserved. */ -#include +#include #include #include #include diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c index ab2564f95199..71ad6b0ccab4 100644 --- a/arch/ia64/sn/kernel/setup.c +++ b/arch/ia64/sn/kernel/setup.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/m68k/atari/stram.c b/arch/m68k/atari/stram.c index 1089d67df315..6ffc204eb07d 100644 --- a/arch/m68k/atari/stram.c +++ b/arch/m68k/atari/stram.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/m68k/coldfire/m54xx.c b/arch/m68k/coldfire/m54xx.c index adad03ca6e11..360c723c0ae6 100644 --- a/arch/m68k/coldfire/m54xx.c +++ b/arch/m68k/coldfire/m54xx.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/m68k/kernel/setup_mm.c b/arch/m68k/kernel/setup_mm.c index 5d3596c180f9..a1a3eaeaf58c 100644 --- a/arch/m68k/kernel/setup_mm.c +++ b/arch/m68k/kernel/setup_mm.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/m68k/kernel/setup_no.c b/arch/m68k/kernel/setup_no.c index cfd5475bfc31..3c5def10d486 100644 --- a/arch/m68k/kernel/setup_no.c +++ b/arch/m68k/kernel/setup_no.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/m68k/kernel/uboot.c b/arch/m68k/kernel/uboot.c index 107082877064..1b4c562753da 100644 --- a/arch/m68k/kernel/uboot.c +++ b/arch/m68k/kernel/uboot.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index ae49ae4d3049..933c33e76a48 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/m68k/mm/mcfmmu.c b/arch/m68k/mm/mcfmmu.c index 38a1d92dd555..0de4999a3810 100644 --- a/arch/m68k/mm/mcfmmu.c +++ b/arch/m68k/mm/mcfmmu.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index 2113eec8dbf9..7497cf30bf1c 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include diff --git a/arch/m68k/mm/sun3mmu.c b/arch/m68k/mm/sun3mmu.c index 19c05ab9824d..f736db48a2e1 100644 --- a/arch/m68k/mm/sun3mmu.c +++ b/arch/m68k/mm/sun3mmu.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/m68k/sun3/config.c b/arch/m68k/sun3/config.c index 79a2bb857906..542c4404861c 100644 --- a/arch/m68k/sun3/config.c +++ b/arch/m68k/sun3/config.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/m68k/sun3/dvma.c b/arch/m68k/sun3/dvma.c index 5f92c72b05c3..a2c1c9304895 100644 --- a/arch/m68k/sun3/dvma.c +++ b/arch/m68k/sun3/dvma.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/m68k/sun3/mmu_emu.c b/arch/m68k/sun3/mmu_emu.c index d30da12a1702..582a1284059a 100644 --- a/arch/m68k/sun3/mmu_emu.c +++ b/arch/m68k/sun3/mmu_emu.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/m68k/sun3/sun3dvma.c b/arch/m68k/sun3/sun3dvma.c index 72d94585b52e..8be8b750c629 100644 --- a/arch/m68k/sun3/sun3dvma.c +++ b/arch/m68k/sun3/sun3dvma.c @@ -7,7 +7,7 @@ * Contains common routines for sun3/sun3x DVMA management. */ -#include +#include #include #include #include diff --git a/arch/m68k/sun3x/dvma.c b/arch/m68k/sun3x/dvma.c index b2acbc862f60..89e630e66555 100644 --- a/arch/m68k/sun3x/dvma.c +++ b/arch/m68k/sun3x/dvma.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/microblaze/mm/consistent.c b/arch/microblaze/mm/consistent.c index d801cc5f5b95..45e0a1aa9357 100644 --- a/arch/microblaze/mm/consistent.c +++ b/arch/microblaze/mm/consistent.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 9989740d397a..8c14988f52f2 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -7,10 +7,9 @@ * for more details. */ -#include +#include #include #include -#include #include /* mem_init */ #include #include diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c index 2ffd171af8b6..6b89a66ec1a5 100644 --- a/arch/microblaze/pci/pci-common.c +++ b/arch/microblaze/pci/pci-common.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/ar7/memory.c b/arch/mips/ar7/memory.c index 0332f0514d05..80390a9ec264 100644 --- a/arch/mips/ar7/memory.c +++ b/arch/mips/ar7/memory.c @@ -16,7 +16,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ -#include +#include #include #include #include diff --git a/arch/mips/ath79/setup.c b/arch/mips/ath79/setup.c index 4c7a93f4039a..9728abcb18fa 100644 --- a/arch/mips/ath79/setup.c +++ b/arch/mips/ath79/setup.c @@ -14,7 +14,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/mips/bcm63xx/prom.c b/arch/mips/bcm63xx/prom.c index 7019e2967009..77a836e661c9 100644 --- a/arch/mips/bcm63xx/prom.c +++ b/arch/mips/bcm63xx/prom.c @@ -7,7 +7,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/mips/bcm63xx/setup.c b/arch/mips/bcm63xx/setup.c index 2be9caaa2085..e28ee9a7cc7e 100644 --- a/arch/mips/bcm63xx/setup.c +++ b/arch/mips/bcm63xx/setup.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/bmips/setup.c b/arch/mips/bmips/setup.c index 6329c5f780d6..1738a06396f9 100644 --- a/arch/mips/bmips/setup.c +++ b/arch/mips/bmips/setup.c @@ -9,7 +9,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/mips/cavium-octeon/dma-octeon.c b/arch/mips/cavium-octeon/dma-octeon.c index c44c1a654471..e8eb60ed99f2 100644 --- a/arch/mips/cavium-octeon/dma-octeon.c +++ b/arch/mips/cavium-octeon/dma-octeon.c @@ -11,7 +11,7 @@ * Copyright (C) 2010 Cavium Networks, Inc. */ #include -#include +#include #include #include #include diff --git a/arch/mips/dec/prom/memory.c b/arch/mips/dec/prom/memory.c index a2acc6454cf3..5073d2ed78bb 100644 --- a/arch/mips/dec/prom/memory.c +++ b/arch/mips/dec/prom/memory.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/mips/emma/common/prom.c b/arch/mips/emma/common/prom.c index cae42259d6da..675337b8a4a0 100644 --- a/arch/mips/emma/common/prom.c +++ b/arch/mips/emma/common/prom.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/mips/fw/arc/memory.c b/arch/mips/fw/arc/memory.c index dd9496f26e6a..429b7f8d2aeb 100644 --- a/arch/mips/fw/arc/memory.c +++ b/arch/mips/fw/arc/memory.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/mips/jazz/jazzdma.c b/arch/mips/jazz/jazzdma.c index 0a0aaf39fd16..4c41ed0a637e 100644 --- a/arch/mips/jazz/jazzdma.c +++ b/arch/mips/jazz/jazzdma.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/crash.c b/arch/mips/kernel/crash.c index 2c7288041a99..81845ba04835 100644 --- a/arch/mips/kernel/crash.c +++ b/arch/mips/kernel/crash.c @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/crash_dump.c b/arch/mips/kernel/crash_dump.c index a8657d29c62e..01b2bd95ba1f 100644 --- a/arch/mips/kernel/crash_dump.c +++ b/arch/mips/kernel/crash_dump.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/prom.c b/arch/mips/kernel/prom.c index 89950b7bf536..93b8e0b4332f 100644 --- a/arch/mips/kernel/prom.c +++ b/arch/mips/kernel/prom.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 31522d3bc8bf..41c1683761bb 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 623dc18f7f2f..0f852e1b5891 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/mips/kernel/vpe.c b/arch/mips/kernel/vpe.c index 0bef238d2c0c..6176b9acba95 100644 --- a/arch/mips/kernel/vpe.c +++ b/arch/mips/kernel/vpe.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/kvm/commpage.c b/arch/mips/kvm/commpage.c index f43629979a0e..5812e6145801 100644 --- a/arch/mips/kvm/commpage.c +++ b/arch/mips/kvm/commpage.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c index f8e772564d74..d77b61b3d6ee 100644 --- a/arch/mips/kvm/dyntrans.c +++ b/arch/mips/kvm/dyntrans.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include "commpage.h" diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 4144bfaef137..ec9ed23bca7f 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/kvm/interrupt.c b/arch/mips/kvm/interrupt.c index aa0a1a00faf6..7257e8b6f5a9 100644 --- a/arch/mips/kvm/interrupt.c +++ b/arch/mips/kvm/interrupt.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index f7ea8e21656b..1fcc4d149054 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/mips/lantiq/prom.c b/arch/mips/lantiq/prom.c index d984bd5c2ec5..14d4c5e2b42f 100644 --- a/arch/mips/lantiq/prom.c +++ b/arch/mips/lantiq/prom.c @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include diff --git a/arch/mips/lasat/prom.c b/arch/mips/lasat/prom.c index 37b8fc5b9ac9..5ce1407de2d5 100644 --- a/arch/mips/lasat/prom.c +++ b/arch/mips/lasat/prom.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/loongson64/common/init.c b/arch/mips/loongson64/common/init.c index 6ef17120722f..c073fbcb9805 100644 --- a/arch/mips/loongson64/common/init.c +++ b/arch/mips/loongson64/common/init.c @@ -8,7 +8,7 @@ * option) any later version. */ -#include +#include #include #include #include diff --git a/arch/mips/loongson64/loongson-3/numa.c b/arch/mips/loongson64/loongson-3/numa.c index 703ad4536fe0..622761878cd1 100644 --- a/arch/mips/loongson64/loongson-3/numa.c +++ b/arch/mips/loongson64/loongson-3/numa.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 0893b6136498..b521d8e2d359 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/mm/pgtable-32.c b/arch/mips/mm/pgtable-32.c index b19a3c506b1e..e2a33adc0f29 100644 --- a/arch/mips/mm/pgtable-32.c +++ b/arch/mips/mm/pgtable-32.c @@ -7,7 +7,7 @@ */ #include #include -#include +#include #include #include #include diff --git a/arch/mips/mti-malta/malta-memory.c b/arch/mips/mti-malta/malta-memory.c index a47556723b85..868921adef1d 100644 --- a/arch/mips/mti-malta/malta-memory.c +++ b/arch/mips/mti-malta/malta-memory.c @@ -12,7 +12,7 @@ * Steven J. Hill */ #include -#include +#include #include #include diff --git a/arch/mips/netlogic/xlp/dt.c b/arch/mips/netlogic/xlp/dt.c index b5ba83f4c646..c856f2a3ea42 100644 --- a/arch/mips/netlogic/xlp/dt.c +++ b/arch/mips/netlogic/xlp/dt.c @@ -33,7 +33,7 @@ */ #include -#include +#include #include #include diff --git a/arch/mips/pci/pci-legacy.c b/arch/mips/pci/pci-legacy.c index 3c3b1e6abb53..687513880fbf 100644 --- a/arch/mips/pci/pci-legacy.c +++ b/arch/mips/pci/pci-legacy.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/pci/pci.c b/arch/mips/pci/pci.c index c2e94cf5ecda..e68b44b27c0d 100644 --- a/arch/mips/pci/pci.c +++ b/arch/mips/pci/pci.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/ralink/of.c b/arch/mips/ralink/of.c index 1ada8492733b..d544e7b07f7a 100644 --- a/arch/mips/ralink/of.c +++ b/arch/mips/ralink/of.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/mips/rb532/prom.c b/arch/mips/rb532/prom.c index 6484e4a4597b..361a690facbf 100644 --- a/arch/mips/rb532/prom.c +++ b/arch/mips/rb532/prom.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index cb1f1a6a166d..d8b8444d6795 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/mips/sibyte/common/cfe.c b/arch/mips/sibyte/common/cfe.c index 092fb2a6ec4a..12a780f251e1 100644 --- a/arch/mips/sibyte/common/cfe.c +++ b/arch/mips/sibyte/common/cfe.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/mips/sibyte/swarm/setup.c b/arch/mips/sibyte/swarm/setup.c index 152ca71cc2d7..3b034b7178d6 100644 --- a/arch/mips/sibyte/swarm/setup.c +++ b/arch/mips/sibyte/swarm/setup.c @@ -23,7 +23,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/mips/txx9/rbtx4938/prom.c b/arch/mips/txx9/rbtx4938/prom.c index bcb469247e8c..2b36a2ee744c 100644 --- a/arch/mips/txx9/rbtx4938/prom.c +++ b/arch/mips/txx9/rbtx4938/prom.c @@ -11,7 +11,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/nds32/kernel/setup.c b/arch/nds32/kernel/setup.c index 63a1a5ef5219..eacc79024879 100644 --- a/arch/nds32/kernel/setup.c +++ b/arch/nds32/kernel/setup.c @@ -2,9 +2,8 @@ // Copyright (C) 2005-2017 Andes Technology Corporation #include -#include -#include #include +#include #include #include #include diff --git a/arch/nds32/mm/highmem.c b/arch/nds32/mm/highmem.c index e17cb8a69315..022779af6148 100644 --- a/arch/nds32/mm/highmem.c +++ b/arch/nds32/mm/highmem.c @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/nds32/mm/init.c b/arch/nds32/mm/init.c index 66d3e9cf498d..131104bd2538 100644 --- a/arch/nds32/mm/init.c +++ b/arch/nds32/mm/init.c @@ -7,12 +7,11 @@ #include #include #include -#include +#include #include #include #include #include -#include #include #include diff --git a/arch/nios2/kernel/prom.c b/arch/nios2/kernel/prom.c index a6d4f7530247..232a36b511aa 100644 --- a/arch/nios2/kernel/prom.c +++ b/arch/nios2/kernel/prom.c @@ -25,7 +25,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/nios2/kernel/setup.c b/arch/nios2/kernel/setup.c index 2d0011ddd4d5..6bbd4ae2beb0 100644 --- a/arch/nios2/kernel/setup.c +++ b/arch/nios2/kernel/setup.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index 12923501d94f..16cea5776b87 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/openrisc/kernel/setup.c b/arch/openrisc/kernel/setup.c index e17fcd83120f..c605bdad1746 100644 --- a/arch/openrisc/kernel/setup.c +++ b/arch/openrisc/kernel/setup.c @@ -30,13 +30,12 @@ #include #include #include -#include +#include #include #include #include #include #include -#include #include #include diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index 91a6a9ab7598..d157310eb377 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -26,12 +26,11 @@ #include #include #include -#include +#include #include #include #include /* for initrd_* */ #include -#include #include #include diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 7e7a3126c5e9..2d7cffcaa476 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -14,7 +14,6 @@ #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index d39ec3a4550a..274bd1442dd9 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 2b56d1f30387..93ee3703b42f 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 9216c3a7fcfc..2a51e4cc8246 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -29,10 +29,9 @@ #include #include #include -#include +#include #include #include -#include #include #include diff --git a/arch/powerpc/lib/alloc.c b/arch/powerpc/lib/alloc.c index bf87d6e13369..5b61704447c1 100644 --- a/arch/powerpc/lib/alloc.c +++ b/arch/powerpc/lib/alloc.c @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index a7226ed9cae6..8cf035e68378 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index b3fe79064a69..0a64fffabee1 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -27,12 +27,11 @@ #include #include #include -#include +#include #include #include #include #include -#include #include #include #include diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index 954f1986af4d..67b9d7b669a1 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -44,7 +44,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index f04f15f9d232..3a048e98a132 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -11,7 +11,7 @@ #define pr_fmt(fmt) "numa: " fmt #include -#include +#include #include #include #include @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/powermac/nvram.c b/arch/powerpc/platforms/powermac/nvram.c index f45b369177a4..f3391be7c762 100644 --- a/arch/powerpc/platforms/powermac/nvram.c +++ b/arch/powerpc/platforms/powermac/nvram.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 23a67b545b70..aba81cbf0b36 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -17,11 +17,10 @@ #include #include #include -#include +#include #include #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/ps3/setup.c b/arch/powerpc/platforms/ps3/setup.c index 12519857a33c..658bfab3350b 100644 --- a/arch/powerpc/platforms/ps3/setup.c +++ b/arch/powerpc/platforms/ps3/setup.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/powerpc/sysdev/msi_bitmap.c b/arch/powerpc/sysdev/msi_bitmap.c index 349a9ff6ca5b..2444feda831f 100644 --- a/arch/powerpc/sysdev/msi_bitmap.c +++ b/arch/powerpc/sysdev/msi_bitmap.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index d58c111099b3..1d9bfaff60bc 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -13,9 +13,8 @@ #include #include -#include -#include #include +#include #include #include diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index d17566a8c76f..97eae3871868 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -13,10 +13,9 @@ #include #include #include -#include +#include #include #include -#include #include #include #include diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 781c1053a773..72dd23ef771b 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 44f9a7d6450b..f82b3d3c36e2 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -20,7 +20,7 @@ #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include -#include +#include #include #include #include @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 799a91882a76..8992b04c0ade 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -8,7 +8,7 @@ #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include -#include +#include #include #include #include diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index ec31b48a42a5..ebe748a9f472 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c index 84111a43ea29..eba2def3414d 100644 --- a/arch/s390/mm/extmem.c +++ b/arch/s390/mm/extmem.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 873f6ee1c46d..76d0708438e9 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 04638b0b9ef1..0472e27febdf 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -4,14 +4,13 @@ * Author(s): Heiko Carstens */ -#include +#include #include #include #include #include #include #include -#include #include #include #include diff --git a/arch/s390/numa/mode_emu.c b/arch/s390/numa/mode_emu.c index 5a381fc8e958..bfba273c32c0 100644 --- a/arch/s390/numa/mode_emu.c +++ b/arch/s390/numa/mode_emu.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/s390/numa/numa.c b/arch/s390/numa/numa.c index 297f5d8b0890..ae0d9e889534 100644 --- a/arch/s390/numa/numa.c +++ b/arch/s390/numa/numa.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/s390/numa/toptree.c b/arch/s390/numa/toptree.c index 7f61cc3fd4d1..71a608cd4f61 100644 --- a/arch/s390/numa/toptree.c +++ b/arch/s390/numa/toptree.c @@ -8,7 +8,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 21447f866415..c8c13c777162 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -11,12 +11,11 @@ #include #include #include -#include +#include #include #include #include #include -#include #include #include #include diff --git a/arch/sh/mm/ioremap_fixed.c b/arch/sh/mm/ioremap_fixed.c index 927a1294c465..07e744d75fa0 100644 --- a/arch/sh/mm/ioremap_fixed.c +++ b/arch/sh/mm/ioremap_fixed.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c index a41526bd91e2..9a26b442f820 100644 --- a/arch/sparc/kernel/mdesc.c +++ b/arch/sparc/kernel/mdesc.c @@ -5,13 +5,11 @@ */ #include #include -#include #include #include #include #include #include -#include #include #include #include diff --git a/arch/sparc/kernel/prom_32.c b/arch/sparc/kernel/prom_32.c index 4389944735c6..d41e2a749c5d 100644 --- a/arch/sparc/kernel/prom_32.c +++ b/arch/sparc/kernel/prom_32.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/sparc/kernel/setup_64.c b/arch/sparc/kernel/setup_64.c index de7c87b153fe..cd2825cb8420 100644 --- a/arch/sparc/kernel/setup_64.c +++ b/arch/sparc/kernel/setup_64.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 6cc80d0f4b9f..4792e08ad36b 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index 880714565c40..d900952bfc5f 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index a8c3453195e6..3c8aac21f426 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include @@ -25,7 +25,6 @@ #include #include #include -#include #include #include diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index b48fea5ad9ef..a6142c5abf61 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -11,7 +11,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c index ef19a391214f..673816880cce 100644 --- a/arch/um/drivers/net_kern.c +++ b/arch/um/drivers/net_kern.c @@ -6,7 +6,7 @@ * Licensed under the GPL. */ -#include +#include #include #include #include diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index 20442d20bd09..2b4dded11a7a 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -9,7 +9,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/um/kernel/initrd.c b/arch/um/kernel/initrd.c index 844056cf313e..3678f5b05e42 100644 --- a/arch/um/kernel/initrd.c +++ b/arch/um/kernel/initrd.c @@ -4,7 +4,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 2c672a8f4571..1067469ba2ea 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -5,7 +5,6 @@ #include #include -#include #include #include #include diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c index 296a91a04598..5bf56af4d5b9 100644 --- a/arch/um/kernel/physmem.c +++ b/arch/um/kernel/physmem.c @@ -4,7 +4,6 @@ */ #include -#include #include #include #include diff --git a/arch/unicore32/kernel/hibernate.c b/arch/unicore32/kernel/hibernate.c index 9969ec374abb..29b71c68eb7c 100644 --- a/arch/unicore32/kernel/hibernate.c +++ b/arch/unicore32/kernel/hibernate.c @@ -13,7 +13,7 @@ #include #include -#include +#include #include #include diff --git a/arch/unicore32/kernel/setup.c b/arch/unicore32/kernel/setup.c index 9f163f976315..b2c38b32ea57 100644 --- a/arch/unicore32/kernel/setup.c +++ b/arch/unicore32/kernel/setup.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -27,7 +27,6 @@ #include #include #include -#include #include #include diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index 44fd0e8fbe87..cf4eb9481fd6 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c @@ -11,13 +11,12 @@ #include #include #include -#include +#include #include #include #include #include #include -#include #include #include #include diff --git a/arch/unicore32/mm/mmu.c b/arch/unicore32/mm/mmu.c index 18b355a20f0b..040a8c279761 100644 --- a/arch/unicore32/mm/mmu.c +++ b/arch/unicore32/mm/mmu.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index f5ea6415b778..7f5d212551d4 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index f1915b744052..ca13851f0570 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -7,7 +7,6 @@ */ #include -#include #include #include #include diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index ab731ab09f06..32b2b7a41ef5 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 8c7450900e0e..5fbc57e4b0b9 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -47,7 +47,7 @@ #include #include /* time_after() */ #include -#include +#include #include #include diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 660d0b22e962..cbbd57ae06ee 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1,7 +1,7 @@ /* cpu_feature_enabled() cannot be used this early */ #define USE_EARLY_PGTABLE_L5 -#include +#include #include #include #include diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index a0ec4c37265a..68ff62bffbab 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -9,11 +9,10 @@ * allocation code routines via a platform independent interface (memblock, etc.). */ #include -#include +#include #include #include #include -#include #include #include diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index f1c5eb99d445..3482460d984d 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 7ba73fe0d917..f4562fcec681 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 71c0b01d93b1..bd08b9e1c9e2 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 637982efecd8..9b158b4716d2 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 7005f89bf3b2..b74e7bfed6ab 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 483412fb8a24..e8796fcd7e5a 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 5369d7fac797..a9134d1910b9 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -49,7 +49,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c index 75730ce01f8d..285aaa62d153 100644 --- a/arch/x86/kernel/tce_64.c +++ b/arch/x86/kernel/tce_64.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c index 048c761d97b0..058b2f36b3a6 100644 --- a/arch/x86/mm/amdtopology.c +++ b/arch/x86/mm/amdtopology.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index b24eb4eb9984..71d4b9d4d43f 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -8,7 +8,7 @@ #include /* task_stack_*(), ... */ #include /* oops_begin/end, ... */ #include /* search_exception_tables */ -#include /* max_low_pfn */ +#include /* max_low_pfn */ #include /* NOKPROBE_SYMBOL, ... */ #include /* kmmio_handler, ... */ #include /* perf_sw_event */ diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 62915a5e0fa2..0d4bdcb84da5 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -1,7 +1,7 @@ #include #include #include /* for totalram_pages */ -#include +#include void *kmap(struct page *page) { diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index faca978ebf9d..ef99f3892e1f 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -3,7 +3,6 @@ #include #include #include -#include /* for max_low_pfn */ #include #include diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 3bbe5f58a67d..49ecf5ecf6d3 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index bfb0bedc21d3..5fab264948c2 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 24e0920a9b25..5378d10f1d31 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -6,7 +6,7 @@ * (C) Copyright 1995 1996 Linus Torvalds */ -#include +#include #include #include #include diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 8f87499124b8..04a9cf6b034f 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -5,10 +5,9 @@ /* cpu_feature_enabled() cannot be used this early */ #define USE_EARLY_PGTABLE_L5 -#include +#include #include #include -#include #include #include #include diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 61db77b0eda9..3f452ffed7e9 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 16e37d712ffd..1308f5408bf7 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index e8a4a09e20f1..f2bd3d61e16b 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -22,7 +22,6 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include #include #include diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 066f3511d5f1..59d80160fa5a 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -3,7 +3,7 @@ * Generic VM initialization for x86-64 NUMA setups. * Copyright 2002,2003 Andi Kleen, SuSE Labs. */ -#include +#include #include "numa_internal.h" diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index b54d52a2d00a..a80fdd7fb40f 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -6,7 +6,6 @@ #include #include #include -#include #include #include "numa_internal.h" diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index a25588ad75ef..08f8f76a4852 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -5,7 +5,7 @@ * Clears the a test pte bit on random pages in the direct mapping, * then reverts and compares page tables forwards and afterwards. */ -#include +#include #include #include #include diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 62bb30b4bd2a..f799076e3d57 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -3,7 +3,7 @@ * Thanks to Ben LaHaise for precious feedback. */ #include -#include +#include #include #include #include diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 3d0c83ef6aab..08013524fba1 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -8,7 +8,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c index 7f9acb68324c..bdc98150d4db 100644 --- a/arch/x86/mm/physaddr.c +++ b/arch/x86/mm/physaddr.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include #include #include diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index ed4ac215305d..8cd66152cdb0 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 9061babfbc83..7ae939e353cd 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -36,9 +36,8 @@ #include #include #include -#include -#include #include +#include #include #include #include diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index e8da7f492970..cf0347f61b21 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 4b70d0f5a803..95e77a667ba5 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c index 140cd76ee897..115c8e4173bb 100644 --- a/arch/x86/platform/olpc/olpc_dt.c +++ b/arch/x86/platform/olpc/olpc_dt.c @@ -17,7 +17,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index 15695e30f982..be15bdcb20df 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 67b2f31a1265..e996e8e744cb 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG -#include +#include #endif #include #include diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index ec7a4209f310..2f6787fc7106 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include @@ -31,7 +31,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index b3e11afed25b..b06731705529 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -67,7 +67,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/xtensa/kernel/pci.c b/arch/xtensa/kernel/pci.c index 21f13e9aabe1..5ca440a74316 100644 --- a/arch/xtensa/kernel/pci.c +++ b/arch/xtensa/kernel/pci.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/xtensa/mm/cache.c b/arch/xtensa/mm/cache.c index 9220dcde7520..b27359e2a464 100644 --- a/arch/xtensa/mm/cache.c +++ b/arch/xtensa/mm/cache.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index f7fbe6334939..9750a48f491b 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/xtensa/mm/kasan_init.c b/arch/xtensa/mm/kasan_init.c index 1a30a258ccd0..6b95ca43aec0 100644 --- a/arch/xtensa/mm/kasan_init.c +++ b/arch/xtensa/mm/kasan_init.c @@ -8,11 +8,10 @@ * Copyright (C) 2017 Cadence Design Systems Inc. */ -#include +#include #include #include #include -#include #include #include #include diff --git a/arch/xtensa/mm/mmu.c b/arch/xtensa/mm/mmu.c index f33a1ff2662a..a4dcfd39bc5c 100644 --- a/arch/xtensa/mm/mmu.c +++ b/arch/xtensa/mm/mmu.c @@ -4,7 +4,7 @@ * * Extracted from init.c */ -#include +#include #include #include #include diff --git a/arch/xtensa/platforms/iss/network.c b/arch/xtensa/platforms/iss/network.c index 206b9d4591e8..190846dddc67 100644 --- a/arch/xtensa/platforms/iss/network.c +++ b/arch/xtensa/platforms/iss/network.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/xtensa/platforms/iss/setup.c b/arch/xtensa/platforms/iss/setup.c index 58709e89a8ed..c14cc673976c 100644 --- a/arch/xtensa/platforms/iss/setup.c +++ b/arch/xtensa/platforms/iss/setup.c @@ -16,7 +16,7 @@ * option) any later version. * */ -#include +#include #include #include #include diff --git a/block/blk-settings.c b/block/blk-settings.c index ffd459969689..696c04c1ab6c 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -6,7 +6,7 @@ #include #include #include -#include /* for max_pfn/max_low_pfn */ +#include /* for max_pfn/max_low_pfn */ #include #include #include diff --git a/block/bounce.c b/block/bounce.c index ec0d99995f5f..cf49fe02f65c 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 85167603b9c9..274699463b4f 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index a3d012b08fc5..61203eebf3a1 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -31,9 +31,8 @@ #include #include #include -#include -#include #include +#include #include #include "internal.h" diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 23cf4427f425..41b91af95afb 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/clk/ti/clk.c b/drivers/clk/ti/clk.c index 5c54d3734daf..5b2867a33b98 100644 --- a/drivers/clk/ti/clk.c +++ b/drivers/clk/ti/clk.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include "clock.h" diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c index f2483548cde9..099d83e4e910 100644 --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/firmware/efi/apple-properties.c b/drivers/firmware/efi/apple-properties.c index 2b675f788b61..ac1654f74dc7 100644 --- a/drivers/firmware/efi/apple-properties.c +++ b/drivers/firmware/efi/apple-properties.c @@ -20,7 +20,7 @@ #define pr_fmt(fmt) "apple-properties: " fmt -#include +#include #include #include #include diff --git a/drivers/firmware/iscsi_ibft_find.c b/drivers/firmware/iscsi_ibft_find.c index 2224f1dc074b..72d9ea18270b 100644 --- a/drivers/firmware/iscsi_ibft_find.c +++ b/drivers/firmware/iscsi_ibft_find.c @@ -18,7 +18,7 @@ * GNU General Public License for more details. */ -#include +#include #include #include #include diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c index 03cead6d5f97..2a23453f005a 100644 --- a/drivers/firmware/memmap.c +++ b/drivers/firmware/memmap.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index f9f69f7111a9..44bd5b9166bb 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -11,7 +11,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ -#include +#include #include #include #include diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index 676c029494e4..0e780848f59b 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -13,7 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ -#include +#include #include #include #include diff --git a/drivers/macintosh/smu.c b/drivers/macintosh/smu.c index 0069f9084f9f..880a81c82b7a 100644 --- a/drivers/macintosh/smu.c +++ b/drivers/macintosh/smu.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include @@ -38,7 +38,6 @@ #include #include #include -#include #include #include diff --git a/drivers/mtd/ar7part.c b/drivers/mtd/ar7part.c index fc15ec58230a..0d33cf0842ad 100644 --- a/drivers/mtd/ar7part.c +++ b/drivers/mtd/ar7part.c @@ -25,7 +25,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/net/arcnet/arc-rimi.c b/drivers/net/arcnet/arc-rimi.c index a07e24970be4..11c5bad95226 100644 --- a/drivers/net/arcnet/arc-rimi.c +++ b/drivers/net/arcnet/arc-rimi.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/arcnet/com20020-isa.c b/drivers/net/arcnet/com20020-isa.c index 38fa60ddaf2e..28510e33924f 100644 --- a/drivers/net/arcnet/com20020-isa.c +++ b/drivers/net/arcnet/com20020-isa.c @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include #include "arcdevice.h" diff --git a/drivers/net/arcnet/com90io.c b/drivers/net/arcnet/com90io.c index 4e56aaf2b984..2c546013a980 100644 --- a/drivers/net/arcnet/com90io.c +++ b/drivers/net/arcnet/com90io.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index ffe62a7ae19b..bb532aae0d92 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c index 01e23b85e798..49ae2aa744d6 100644 --- a/drivers/of/unittest.c +++ b/drivers/of/unittest.c @@ -5,7 +5,7 @@ #define pr_fmt(fmt) "### dt-test ### " fmt -#include +#include #include #include #include diff --git a/drivers/s390/char/fs3270.c b/drivers/s390/char/fs3270.c index 16a4e8528bbc..8f3a2eeb28dc 100644 --- a/drivers/s390/char/fs3270.c +++ b/drivers/s390/char/fs3270.c @@ -8,7 +8,7 @@ * Copyright IBM Corp. 2003, 2009 */ -#include +#include #include #include #include diff --git a/drivers/s390/char/tty3270.c b/drivers/s390/char/tty3270.c index 5b8af2782282..2b0c36c2c568 100644 --- a/drivers/s390/char/tty3270.c +++ b/drivers/s390/char/tty3270.c @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/s390/cio/cmf.c b/drivers/s390/cio/cmf.c index 8af4948dae80..72dd2471ec1e 100644 --- a/drivers/s390/cio/cmf.c +++ b/drivers/s390/cio/cmf.c @@ -13,7 +13,7 @@ #define KMSG_COMPONENT "cio" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt -#include +#include #include #include #include diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c index 8f5c1d7f751a..97b6f197f007 100644 --- a/drivers/s390/virtio/virtio_ccw.c +++ b/drivers/s390/virtio/virtio_ccw.c @@ -9,7 +9,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/sfi/sfi_core.c b/drivers/sfi/sfi_core.c index 153b3f3cc795..a5136901dd8a 100644 --- a/drivers/sfi/sfi_core.c +++ b/drivers/sfi/sfi_core.c @@ -59,7 +59,7 @@ #define KMSG_COMPONENT "SFI" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt -#include +#include #include #include #include diff --git a/drivers/tty/serial/cpm_uart/cpm_uart_core.c b/drivers/tty/serial/cpm_uart/cpm_uart_core.c index 79ad30d34949..b929c7ae3a27 100644 --- a/drivers/tty/serial/cpm_uart/cpm_uart_core.c +++ b/drivers/tty/serial/cpm_uart/cpm_uart_core.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/tty/serial/cpm_uart/cpm_uart_cpm1.c b/drivers/tty/serial/cpm_uart/cpm_uart_cpm1.c index 4eba17f3d293..56fc527015cb 100644 --- a/drivers/tty/serial/cpm_uart/cpm_uart_cpm1.c +++ b/drivers/tty/serial/cpm_uart/cpm_uart_cpm1.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/tty/serial/cpm_uart/cpm_uart_cpm2.c b/drivers/tty/serial/cpm_uart/cpm_uart_cpm2.c index e3bff068dc3c..6a1cd03bfe39 100644 --- a/drivers/tty/serial/cpm_uart/cpm_uart_cpm2.c +++ b/drivers/tty/serial/cpm_uart/cpm_uart_cpm2.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/usb/early/xhci-dbc.c b/drivers/usb/early/xhci-dbc.c index ddc5fa88f268..d2652dccc699 100644 --- a/drivers/usb/early/xhci-dbc.c +++ b/drivers/usb/early/xhci-dbc.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index e12bb256036f..a3f5cbfcd4a1 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -44,7 +44,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index e6c1934734b7..93194f3e7540 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 84575baceebc..f15f89df1f36 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -33,7 +33,7 @@ #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt -#include +#include #include #include #include diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index c5f26a87d238..2a7f545bd0b5 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -35,7 +35,6 @@ #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt -#include #include #include #include diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c index 55988b8418ee..5165aa82bf7d 100644 --- a/drivers/xen/xen-selfballoon.c +++ b/drivers/xen/xen-selfballoon.c @@ -68,7 +68,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include -#include +#include #include #include #include diff --git a/fs/dcache.c b/fs/dcache.c index c2e443fb76ae..2593153471cf 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/inode.c b/fs/inode.c index 9b808986d440..9e198f00b64c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/namespace.c b/fs/namespace.c index d86830c86ce8..98d27da43304 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index d297fe4472a9..bbcc185062bb 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/proc/page.c b/fs/proc/page.c index 792c78a49174..6c517b11acf8 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include #include #include diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 0377a104495b..3fe90443c1bb 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h deleted file mode 100644 index b58873a567b2..000000000000 --- a/include/linux/bootmem.h +++ /dev/null @@ -1,173 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 - */ -#ifndef _LINUX_BOOTMEM_H -#define _LINUX_BOOTMEM_H - -#include -#include -#include -#include - -/* - * simple boot-time physical memory area allocator. - */ - -extern unsigned long max_low_pfn; -extern unsigned long min_low_pfn; - -/* - * highest page - */ -extern unsigned long max_pfn; -/* - * highest possible page - */ -extern unsigned long long max_possible_pfn; - -extern unsigned long memblock_free_all(void); -extern void reset_node_managed_pages(pg_data_t *pgdat); -extern void reset_all_zones_managed_pages(void); - -/* We are using top down, so it is safe to use 0 here */ -#define BOOTMEM_LOW_LIMIT 0 - -#ifndef ARCH_LOW_ADDRESS_LIMIT -#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL -#endif - -/* FIXME: use MEMBLOCK_ALLOC_* variants here */ -#define BOOTMEM_ALLOC_ACCESSIBLE 0 -#define BOOTMEM_ALLOC_ANYWHERE (~(phys_addr_t)0) - -/* FIXME: Move to memblock.h at a point where we remove nobootmem.c */ -void *memblock_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align, - phys_addr_t min_addr, - phys_addr_t max_addr, int nid); -void *memblock_alloc_try_nid_nopanic(phys_addr_t size, - phys_addr_t align, phys_addr_t min_addr, - phys_addr_t max_addr, int nid); -void *memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, - phys_addr_t min_addr, phys_addr_t max_addr, int nid); -void __memblock_free_early(phys_addr_t base, phys_addr_t size); -void __memblock_free_late(phys_addr_t base, phys_addr_t size); - -static inline void * __init memblock_alloc( - phys_addr_t size, phys_addr_t align) -{ - return memblock_alloc_try_nid(size, align, BOOTMEM_LOW_LIMIT, - BOOTMEM_ALLOC_ACCESSIBLE, - NUMA_NO_NODE); -} - -static inline void * __init memblock_alloc_raw( - phys_addr_t size, phys_addr_t align) -{ - return memblock_alloc_try_nid_raw(size, align, BOOTMEM_LOW_LIMIT, - BOOTMEM_ALLOC_ACCESSIBLE, - NUMA_NO_NODE); -} - -static inline void * __init memblock_alloc_from( - phys_addr_t size, phys_addr_t align, phys_addr_t min_addr) -{ - return memblock_alloc_try_nid(size, align, min_addr, - BOOTMEM_ALLOC_ACCESSIBLE, - NUMA_NO_NODE); -} - -static inline void * __init memblock_alloc_nopanic( - phys_addr_t size, phys_addr_t align) -{ - return memblock_alloc_try_nid_nopanic(size, align, - BOOTMEM_LOW_LIMIT, - BOOTMEM_ALLOC_ACCESSIBLE, - NUMA_NO_NODE); -} - -static inline void * __init memblock_alloc_low( - phys_addr_t size, phys_addr_t align) -{ - return memblock_alloc_try_nid(size, align, - BOOTMEM_LOW_LIMIT, - ARCH_LOW_ADDRESS_LIMIT, - NUMA_NO_NODE); -} -static inline void * __init memblock_alloc_low_nopanic( - phys_addr_t size, phys_addr_t align) -{ - return memblock_alloc_try_nid_nopanic(size, align, - BOOTMEM_LOW_LIMIT, - ARCH_LOW_ADDRESS_LIMIT, - NUMA_NO_NODE); -} - -static inline void * __init memblock_alloc_from_nopanic( - phys_addr_t size, phys_addr_t align, phys_addr_t min_addr) -{ - return memblock_alloc_try_nid_nopanic(size, align, min_addr, - BOOTMEM_ALLOC_ACCESSIBLE, - NUMA_NO_NODE); -} - -static inline void * __init memblock_alloc_node( - phys_addr_t size, phys_addr_t align, int nid) -{ - return memblock_alloc_try_nid(size, align, BOOTMEM_LOW_LIMIT, - BOOTMEM_ALLOC_ACCESSIBLE, nid); -} - -static inline void * __init memblock_alloc_node_nopanic( - phys_addr_t size, int nid) -{ - return memblock_alloc_try_nid_nopanic(size, 0, BOOTMEM_LOW_LIMIT, - BOOTMEM_ALLOC_ACCESSIBLE, - nid); -} - -static inline void __init memblock_free_early( - phys_addr_t base, phys_addr_t size) -{ - __memblock_free_early(base, size); -} - -static inline void __init memblock_free_early_nid( - phys_addr_t base, phys_addr_t size, int nid) -{ - __memblock_free_early(base, size); -} - -static inline void __init memblock_free_late( - phys_addr_t base, phys_addr_t size) -{ - __memblock_free_late(base, size); -} - -extern void *alloc_large_system_hash(const char *tablename, - unsigned long bucketsize, - unsigned long numentries, - int scale, - int flags, - unsigned int *_hash_shift, - unsigned int *_hash_mask, - unsigned long low_limit, - unsigned long high_limit); - -#define HASH_EARLY 0x00000001 /* Allocating during early boot? */ -#define HASH_SMALL 0x00000002 /* sub-page allocation allowed, min - * shift passed via *_hash_shift */ -#define HASH_ZERO 0x00000004 /* Zero allocated hash table */ - -/* Only NUMA needs hash distribution. 64bit NUMA architectures have - * sufficient vmalloc space. - */ -#ifdef CONFIG_NUMA -#define HASHDIST_DEFAULT IS_ENABLED(CONFIG_64BIT) -extern int hashdist; /* Distribute hashes across NUMA nodes? */ -#else -#define hashdist (0) -#endif - - -#endif /* _LINUX_BOOTMEM_H */ diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 9d46a7204975..1b4d85879cbe 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -15,6 +15,19 @@ #include #include +#include + +extern unsigned long max_low_pfn; +extern unsigned long min_low_pfn; + +/* + * highest page + */ +extern unsigned long max_pfn; +/* + * highest possible page + */ +extern unsigned long long max_possible_pfn; #define INIT_MEMBLOCK_REGIONS 128 #define INIT_PHYSMEM_REGIONS 4 @@ -119,6 +132,10 @@ int memblock_mark_nomap(phys_addr_t base, phys_addr_t size); int memblock_clear_nomap(phys_addr_t base, phys_addr_t size); enum memblock_flags choose_memblock_flags(void); +unsigned long memblock_free_all(void); +void reset_node_managed_pages(pg_data_t *pgdat); +void reset_all_zones_managed_pages(void); + /* Low level functions */ int memblock_add_range(struct memblock_type *type, phys_addr_t base, phys_addr_t size, @@ -300,11 +317,116 @@ static inline int memblock_get_region_node(const struct memblock_region *r) } #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +/* Flags for memblock allocation APIs */ +#define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) +#define MEMBLOCK_ALLOC_ACCESSIBLE 0 + +/* We are using top down, so it is safe to use 0 here */ +#define MEMBLOCK_LOW_LIMIT 0 + +#ifndef ARCH_LOW_ADDRESS_LIMIT +#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL +#endif + phys_addr_t memblock_phys_alloc_nid(phys_addr_t size, phys_addr_t align, int nid); phys_addr_t memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid); phys_addr_t memblock_phys_alloc(phys_addr_t size, phys_addr_t align); +void *memblock_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid); +void *memblock_alloc_try_nid_nopanic(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid); +void *memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid); + +static inline void * __init memblock_alloc(phys_addr_t size, phys_addr_t align) +{ + return memblock_alloc_try_nid(size, align, MEMBLOCK_LOW_LIMIT, + MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE); +} + +static inline void * __init memblock_alloc_raw(phys_addr_t size, + phys_addr_t align) +{ + return memblock_alloc_try_nid_raw(size, align, MEMBLOCK_LOW_LIMIT, + MEMBLOCK_ALLOC_ACCESSIBLE, + NUMA_NO_NODE); +} + +static inline void * __init memblock_alloc_from(phys_addr_t size, + phys_addr_t align, + phys_addr_t min_addr) +{ + return memblock_alloc_try_nid(size, align, min_addr, + MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE); +} + +static inline void * __init memblock_alloc_nopanic(phys_addr_t size, + phys_addr_t align) +{ + return memblock_alloc_try_nid_nopanic(size, align, MEMBLOCK_LOW_LIMIT, + MEMBLOCK_ALLOC_ACCESSIBLE, + NUMA_NO_NODE); +} + +static inline void * __init memblock_alloc_low(phys_addr_t size, + phys_addr_t align) +{ + return memblock_alloc_try_nid(size, align, MEMBLOCK_LOW_LIMIT, + ARCH_LOW_ADDRESS_LIMIT, NUMA_NO_NODE); +} +static inline void * __init memblock_alloc_low_nopanic(phys_addr_t size, + phys_addr_t align) +{ + return memblock_alloc_try_nid_nopanic(size, align, MEMBLOCK_LOW_LIMIT, + ARCH_LOW_ADDRESS_LIMIT, + NUMA_NO_NODE); +} + +static inline void * __init memblock_alloc_from_nopanic(phys_addr_t size, + phys_addr_t align, + phys_addr_t min_addr) +{ + return memblock_alloc_try_nid_nopanic(size, align, min_addr, + MEMBLOCK_ALLOC_ACCESSIBLE, + NUMA_NO_NODE); +} + +static inline void * __init memblock_alloc_node(phys_addr_t size, + phys_addr_t align, int nid) +{ + return memblock_alloc_try_nid(size, align, MEMBLOCK_LOW_LIMIT, + MEMBLOCK_ALLOC_ACCESSIBLE, nid); +} + +static inline void * __init memblock_alloc_node_nopanic(phys_addr_t size, + int nid) +{ + return memblock_alloc_try_nid_nopanic(size, 0, MEMBLOCK_LOW_LIMIT, + MEMBLOCK_ALLOC_ACCESSIBLE, nid); +} + +static inline void __init memblock_free_early(phys_addr_t base, + phys_addr_t size) +{ + __memblock_free_early(base, size); +} + +static inline void __init memblock_free_early_nid(phys_addr_t base, + phys_addr_t size, int nid) +{ + __memblock_free_early(base, size); +} + +static inline void __init memblock_free_late(phys_addr_t base, phys_addr_t size) +{ + __memblock_free_late(base, size); +} + /* * Set the allocation direction to bottom-up or top-down. */ @@ -323,10 +445,6 @@ static inline bool memblock_bottom_up(void) return memblock.bottom_up; } -/* Flags for memblock_alloc_base() amd __memblock_alloc_base() */ -#define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) -#define MEMBLOCK_ALLOC_ACCESSIBLE 0 - phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, phys_addr_t start, phys_addr_t end, enum memblock_flags flags); @@ -432,6 +550,31 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo i < memblock_type->cnt; \ i++, rgn = &memblock_type->regions[i]) +extern void *alloc_large_system_hash(const char *tablename, + unsigned long bucketsize, + unsigned long numentries, + int scale, + int flags, + unsigned int *_hash_shift, + unsigned int *_hash_mask, + unsigned long low_limit, + unsigned long high_limit); + +#define HASH_EARLY 0x00000001 /* Allocating during early boot? */ +#define HASH_SMALL 0x00000002 /* sub-page allocation allowed, min + * shift passed via *_hash_shift */ +#define HASH_ZERO 0x00000004 /* Zero allocated hash table */ + +/* Only NUMA needs hash distribution. 64bit NUMA architectures have + * sufficient vmalloc space. + */ +#ifdef CONFIG_NUMA +#define HASHDIST_DEFAULT IS_ENABLED(CONFIG_64BIT) +extern int hashdist; /* Distribute hashes across NUMA nodes? */ +#else +#define hashdist (0) +#endif + #ifdef CONFIG_MEMTEST extern void early_memtest(phys_addr_t start, phys_addr_t end); #else diff --git a/init/main.c b/init/main.c index 8cef69c61389..51b8e7b8ae5b 100644 --- a/init/main.c +++ b/init/main.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index f14c376937e5..22a12ab5a5e9 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -4,7 +4,7 @@ * * DMA operations that map physical memory directly without using an IOMMU. */ -#include /* for max_pfn */ +#include /* for max_pfn */ #include #include #include diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 801da67e957b..5731daa09a32 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -40,7 +40,7 @@ #include #include -#include +#include #include #define CREATE_TRACE_POINTS diff --git a/kernel/futex.c b/kernel/futex.c index 3e2de8fc1891..f423f9b6577e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -65,7 +65,7 @@ #include #include #include -#include +#include #include #include diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 0130e488ebfe..8f36c27c1794 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -4,7 +4,7 @@ #endif #include -#include +#include #include /* diff --git a/kernel/pid.c b/kernel/pid.c index cdf63e53a014..b2f6c506035d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 34116a6097be..3c9e365438ad 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 429e4a3833ca..1b2a029360b7 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/profile.c b/kernel/profile.c index 9aa2a4445b0d..9c08a2c7cb1d 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -16,7 +16,7 @@ #include #include -#include +#include #include #include #include diff --git a/lib/cpumask.c b/lib/cpumask.c index 1405cb22e6bc..75b5e7672c4c 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -4,7 +4,7 @@ #include #include #include -#include +#include /** * cpumask_next - get the next cpu in a cpumask diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e35d99844612..c007fb5fb8d5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c index 785a9707786b..c7550eb65922 100644 --- a/mm/kasan/kasan_init.c +++ b/mm/kasan/kasan_init.c @@ -10,11 +10,10 @@ * */ -#include +#include #include #include #include -#include #include #include #include diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 4f7e4b5a2f08..877de4fa0720 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -92,7 +92,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/memblock.c b/mm/memblock.c index 2ed73245b5da..c655342569f8 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 7e6509a53d79..41e326472ef9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5f3291601945..ef289fadec0e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include diff --git a/mm/page_ext.c b/mm/page_ext.c index 5323c2ade686..ae44f7adbe07 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include -#include +#include #include #include #include diff --git a/mm/page_idle.c b/mm/page_idle.c index 6302bc62c27d..b9e4b42b33ab 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include -#include +#include #include #include #include diff --git a/mm/page_owner.c b/mm/page_owner.c index d80adfe702d3..87bc0dfdb52b 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/percpu.c b/mm/percpu.c index 3050c1d37d37..61cdbb3b3736 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -65,7 +65,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include -#include +#include #include #include #include diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 7408cabed61a..7fec05796796 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -20,7 +20,6 @@ */ #include #include -#include #include #include #include diff --git a/mm/sparse.c b/mm/sparse.c index b139fbc61d10..ab2ac45e0440 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index f5c9ef2586de..411dd7a90046 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1834818ed07b..9e6bc4d6daa7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -262,7 +262,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ca3ed931f2a9..1976fddb9e00 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -81,7 +81,7 @@ #include #include -#include +#include #include #include #include diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index e948db29ab53..9b277bd36d1a 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -46,7 +46,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/xfrm/xfrm_hash.c b/net/xfrm/xfrm_hash.c index 2ad33ce1ea17..eca8d84d99bf 100644 --- a/net/xfrm/xfrm_hash.c +++ b/net/xfrm/xfrm_hash.c @@ -6,7 +6,7 @@ #include #include -#include +#include #include #include #include -- cgit v1.2.3 From 7e1c4e27928e5f87b9b1eaf06dc31773b2f1e7f1 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 30 Oct 2018 15:09:57 -0700 Subject: memblock: stop using implicit alignment to SMP_CACHE_BYTES When a memblock allocation APIs are called with align = 0, the alignment is implicitly set to SMP_CACHE_BYTES. Implicit alignment is done deep in the memblock allocator and it can come as a surprise. Not that such an alignment would be wrong even when used incorrectly but it is better to be explicit for the sake of clarity and the prinicple of the least surprise. Replace all such uses of memblock APIs with the 'align' parameter explicitly set to SMP_CACHE_BYTES and stop implicit alignment assignment in the memblock internal allocation functions. For the case when memblock APIs are used via helper functions, e.g. like iommu_arena_new_node() in Alpha, the helper functions were detected with Coccinelle's help and then manually examined and updated where appropriate. The direct memblock APIs users were updated using the semantic patch below: @@ expression size, min_addr, max_addr, nid; @@ ( | - memblock_alloc_try_nid_raw(size, 0, min_addr, max_addr, nid) + memblock_alloc_try_nid_raw(size, SMP_CACHE_BYTES, min_addr, max_addr, nid) | - memblock_alloc_try_nid_nopanic(size, 0, min_addr, max_addr, nid) + memblock_alloc_try_nid_nopanic(size, SMP_CACHE_BYTES, min_addr, max_addr, nid) | - memblock_alloc_try_nid(size, 0, min_addr, max_addr, nid) + memblock_alloc_try_nid(size, SMP_CACHE_BYTES, min_addr, max_addr, nid) | - memblock_alloc(size, 0) + memblock_alloc(size, SMP_CACHE_BYTES) | - memblock_alloc_raw(size, 0) + memblock_alloc_raw(size, SMP_CACHE_BYTES) | - memblock_alloc_from(size, 0, min_addr) + memblock_alloc_from(size, SMP_CACHE_BYTES, min_addr) | - memblock_alloc_nopanic(size, 0) + memblock_alloc_nopanic(size, SMP_CACHE_BYTES) | - memblock_alloc_low(size, 0) + memblock_alloc_low(size, SMP_CACHE_BYTES) | - memblock_alloc_low_nopanic(size, 0) + memblock_alloc_low_nopanic(size, SMP_CACHE_BYTES) | - memblock_alloc_from_nopanic(size, 0, min_addr) + memblock_alloc_from_nopanic(size, SMP_CACHE_BYTES, min_addr) | - memblock_alloc_node(size, 0, nid) + memblock_alloc_node(size, SMP_CACHE_BYTES, nid) ) [mhocko@suse.com: changelog update] [akpm@linux-foundation.org: coding-style fixes] [rppt@linux.ibm.com: fix missed uses of implicit alignment] Link: http://lkml.kernel.org/r/20181016133656.GA10925@rapoport-lnx Link: http://lkml.kernel.org/r/1538687224-17535-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Suggested-by: Michal Hocko Acked-by: Paul Burton [MIPS] Acked-by: Michael Ellerman [powerpc] Acked-by: Michal Hocko Cc: Catalin Marinas Cc: Chris Zankel Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: Ingo Molnar Cc: Matt Turner Cc: Michal Simek Cc: Richard Weinberger Cc: Russell King Cc: Thomas Gleixner Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/core_apecs.c | 3 ++- arch/alpha/kernel/core_lca.c | 3 ++- arch/alpha/kernel/core_marvel.c | 4 ++-- arch/alpha/kernel/core_mcpcia.c | 6 +++-- arch/alpha/kernel/core_t2.c | 2 +- arch/alpha/kernel/core_titan.c | 6 +++-- arch/alpha/kernel/core_tsunami.c | 6 +++-- arch/alpha/kernel/core_wildfire.c | 6 +++-- arch/alpha/kernel/pci-noop.c | 4 ++-- arch/alpha/kernel/pci.c | 4 ++-- arch/alpha/kernel/pci_iommu.c | 4 ++-- arch/arm/kernel/setup.c | 4 ++-- arch/arm/mach-omap2/omap_hwmod.c | 8 ++++--- arch/arm64/kernel/setup.c | 2 +- arch/ia64/kernel/mca.c | 4 ++-- arch/ia64/mm/tlb.c | 6 +++-- arch/ia64/sn/kernel/io_common.c | 4 +++- arch/ia64/sn/kernel/setup.c | 5 ++-- arch/m68k/sun3/sun3dvma.c | 2 +- arch/microblaze/mm/init.c | 2 +- arch/mips/kernel/setup.c | 2 +- arch/powerpc/kernel/paca.c | 2 +- arch/powerpc/kernel/pci_32.c | 3 ++- arch/powerpc/lib/alloc.c | 2 +- arch/powerpc/mm/mmu_context_nohash.c | 7 +++--- arch/powerpc/platforms/powermac/nvram.c | 2 +- arch/powerpc/platforms/powernv/pci-ioda.c | 6 ++--- arch/powerpc/sysdev/msi_bitmap.c | 2 +- arch/um/drivers/net_kern.c | 2 +- arch/um/drivers/vector_kern.c | 2 +- arch/um/kernel/initrd.c | 2 +- arch/unicore32/kernel/setup.c | 2 +- arch/x86/kernel/acpi/boot.c | 2 +- arch/x86/kernel/apic/io_apic.c | 2 +- arch/x86/kernel/e820.c | 3 ++- arch/x86/platform/olpc/olpc_dt.c | 2 +- arch/xtensa/platforms/iss/network.c | 2 +- drivers/clk/ti/clk.c | 2 +- drivers/firmware/efi/memmap.c | 2 +- drivers/firmware/memmap.c | 3 ++- drivers/macintosh/smu.c | 2 +- drivers/of/of_reserved_mem.c | 1 + include/linux/memblock.h | 3 ++- init/main.c | 13 +++++++---- kernel/power/snapshot.c | 3 ++- lib/cpumask.c | 2 +- mm/memblock.c | 8 ------- mm/page_alloc.c | 6 +++-- mm/percpu.c | 38 ++++++++++++++++--------------- mm/sparse.c | 3 ++- 50 files changed, 120 insertions(+), 96 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/core_apecs.c b/arch/alpha/kernel/core_apecs.c index 1bf3eef34c22..6df765ff2b10 100644 --- a/arch/alpha/kernel/core_apecs.c +++ b/arch/alpha/kernel/core_apecs.c @@ -346,7 +346,8 @@ apecs_init_arch(void) * Window 1 is direct access 1GB at 1GB * Window 2 is scatter-gather 8MB at 8MB (for isa) */ - hose->sg_isa = iommu_arena_new(hose, 0x00800000, 0x00800000, 0); + hose->sg_isa = iommu_arena_new(hose, 0x00800000, 0x00800000, + SMP_CACHE_BYTES); hose->sg_pci = NULL; __direct_map_base = 0x40000000; __direct_map_size = 0x40000000; diff --git a/arch/alpha/kernel/core_lca.c b/arch/alpha/kernel/core_lca.c index 81c0c43635b0..57e0750419f2 100644 --- a/arch/alpha/kernel/core_lca.c +++ b/arch/alpha/kernel/core_lca.c @@ -275,7 +275,8 @@ lca_init_arch(void) * Note that we do not try to save any of the DMA window CSRs * before setting them, since we cannot read those CSRs on LCA. */ - hose->sg_isa = iommu_arena_new(hose, 0x00800000, 0x00800000, 0); + hose->sg_isa = iommu_arena_new(hose, 0x00800000, 0x00800000, + SMP_CACHE_BYTES); hose->sg_pci = NULL; __direct_map_base = 0x40000000; __direct_map_size = 0x40000000; diff --git a/arch/alpha/kernel/core_marvel.c b/arch/alpha/kernel/core_marvel.c index 8a568c4d8e81..c1d0c18c71ca 100644 --- a/arch/alpha/kernel/core_marvel.c +++ b/arch/alpha/kernel/core_marvel.c @@ -82,7 +82,7 @@ mk_resource_name(int pe, int port, char *str) char *name; sprintf(tmp, "PCI %s PE %d PORT %d", str, pe, port); - name = memblock_alloc(strlen(tmp) + 1, 0); + name = memblock_alloc(strlen(tmp) + 1, SMP_CACHE_BYTES); strcpy(name, tmp); return name; @@ -117,7 +117,7 @@ alloc_io7(unsigned int pe) return NULL; } - io7 = memblock_alloc(sizeof(*io7), 0); + io7 = memblock_alloc(sizeof(*io7), SMP_CACHE_BYTES); io7->pe = pe; raw_spin_lock_init(&io7->irq_lock); diff --git a/arch/alpha/kernel/core_mcpcia.c b/arch/alpha/kernel/core_mcpcia.c index b1549db54260..74b1d018124c 100644 --- a/arch/alpha/kernel/core_mcpcia.c +++ b/arch/alpha/kernel/core_mcpcia.c @@ -364,9 +364,11 @@ mcpcia_startup_hose(struct pci_controller *hose) * Window 1 is scatter-gather (up to) 1GB at 1GB (for pci) * Window 2 is direct access 2GB at 2GB */ - hose->sg_isa = iommu_arena_new(hose, 0x00800000, 0x00800000, 0); + hose->sg_isa = iommu_arena_new(hose, 0x00800000, 0x00800000, + SMP_CACHE_BYTES); hose->sg_pci = iommu_arena_new(hose, 0x40000000, - size_for_memory(0x40000000), 0); + size_for_memory(0x40000000), + SMP_CACHE_BYTES); __direct_map_base = 0x80000000; __direct_map_size = 0x80000000; diff --git a/arch/alpha/kernel/core_t2.c b/arch/alpha/kernel/core_t2.c index 2c00b61ca379..98d5b6ff8a76 100644 --- a/arch/alpha/kernel/core_t2.c +++ b/arch/alpha/kernel/core_t2.c @@ -351,7 +351,7 @@ t2_sg_map_window2(struct pci_controller *hose, /* Note we can only do 1 SG window, as the other is for direct, so do an ISA SG area, especially for the floppy. */ - hose->sg_isa = iommu_arena_new(hose, base, length, 0); + hose->sg_isa = iommu_arena_new(hose, base, length, SMP_CACHE_BYTES); hose->sg_pci = NULL; temp = (base & 0xfff00000UL) | ((base + length - 1) >> 20); diff --git a/arch/alpha/kernel/core_titan.c b/arch/alpha/kernel/core_titan.c index 97551597581b..2a2820fb1be6 100644 --- a/arch/alpha/kernel/core_titan.c +++ b/arch/alpha/kernel/core_titan.c @@ -316,10 +316,12 @@ titan_init_one_pachip_port(titan_pachip_port *port, int index) * Window 1 is direct access 1GB at 2GB * Window 2 is scatter-gather 1GB at 3GB */ - hose->sg_isa = iommu_arena_new(hose, 0x00800000, 0x00800000, 0); + hose->sg_isa = iommu_arena_new(hose, 0x00800000, 0x00800000, + SMP_CACHE_BYTES); hose->sg_isa->align_entry = 8; /* 64KB for ISA */ - hose->sg_pci = iommu_arena_new(hose, 0xc0000000, 0x40000000, 0); + hose->sg_pci = iommu_arena_new(hose, 0xc0000000, 0x40000000, + SMP_CACHE_BYTES); hose->sg_pci->align_entry = 4; /* Titan caches 4 PTEs at a time */ port->wsba[0].csr = hose->sg_isa->dma_base | 3; diff --git a/arch/alpha/kernel/core_tsunami.c b/arch/alpha/kernel/core_tsunami.c index f334b8928d72..fc1ab73f23de 100644 --- a/arch/alpha/kernel/core_tsunami.c +++ b/arch/alpha/kernel/core_tsunami.c @@ -319,12 +319,14 @@ tsunami_init_one_pchip(tsunami_pchip *pchip, int index) * NOTE: we need the align_entry settings for Acer devices on ES40, * specifically floppy and IDE when memory is larger than 2GB. */ - hose->sg_isa = iommu_arena_new(hose, 0x00800000, 0x00800000, 0); + hose->sg_isa = iommu_arena_new(hose, 0x00800000, 0x00800000, + SMP_CACHE_BYTES); /* Initially set for 4 PTEs, but will be overridden to 64K for ISA. */ hose->sg_isa->align_entry = 4; hose->sg_pci = iommu_arena_new(hose, 0x40000000, - size_for_memory(0x40000000), 0); + size_for_memory(0x40000000), + SMP_CACHE_BYTES); hose->sg_pci->align_entry = 4; /* Tsunami caches 4 PTEs at a time */ __direct_map_base = 0x80000000; diff --git a/arch/alpha/kernel/core_wildfire.c b/arch/alpha/kernel/core_wildfire.c index cad36fc6ed7d..353c03d15442 100644 --- a/arch/alpha/kernel/core_wildfire.c +++ b/arch/alpha/kernel/core_wildfire.c @@ -111,8 +111,10 @@ wildfire_init_hose(int qbbno, int hoseno) * ??? We ought to scale window 3 memory. * */ - hose->sg_isa = iommu_arena_new(hose, 0x00800000, 0x00800000, 0); - hose->sg_pci = iommu_arena_new(hose, 0xc0000000, 0x08000000, 0); + hose->sg_isa = iommu_arena_new(hose, 0x00800000, 0x00800000, + SMP_CACHE_BYTES); + hose->sg_pci = iommu_arena_new(hose, 0xc0000000, 0x08000000, + SMP_CACHE_BYTES); pci = WILDFIRE_pci(qbbno, hoseno); diff --git a/arch/alpha/kernel/pci-noop.c b/arch/alpha/kernel/pci-noop.c index a9378ee0c2f1..091cff3c68fd 100644 --- a/arch/alpha/kernel/pci-noop.c +++ b/arch/alpha/kernel/pci-noop.c @@ -33,7 +33,7 @@ alloc_pci_controller(void) { struct pci_controller *hose; - hose = memblock_alloc(sizeof(*hose), 0); + hose = memblock_alloc(sizeof(*hose), SMP_CACHE_BYTES); *hose_tail = hose; hose_tail = &hose->next; @@ -44,7 +44,7 @@ alloc_pci_controller(void) struct resource * __init alloc_resource(void) { - return memblock_alloc(sizeof(struct resource), 0); + return memblock_alloc(sizeof(struct resource), SMP_CACHE_BYTES); } SYSCALL_DEFINE3(pciconfig_iobase, long, which, unsigned long, bus, diff --git a/arch/alpha/kernel/pci.c b/arch/alpha/kernel/pci.c index 13937e72d875..97098127df83 100644 --- a/arch/alpha/kernel/pci.c +++ b/arch/alpha/kernel/pci.c @@ -392,7 +392,7 @@ alloc_pci_controller(void) { struct pci_controller *hose; - hose = memblock_alloc(sizeof(*hose), 0); + hose = memblock_alloc(sizeof(*hose), SMP_CACHE_BYTES); *hose_tail = hose; hose_tail = &hose->next; @@ -403,7 +403,7 @@ alloc_pci_controller(void) struct resource * __init alloc_resource(void) { - return memblock_alloc(sizeof(struct resource), 0); + return memblock_alloc(sizeof(struct resource), SMP_CACHE_BYTES); } diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c index 82cf950bda2a..46e08e0d9181 100644 --- a/arch/alpha/kernel/pci_iommu.c +++ b/arch/alpha/kernel/pci_iommu.c @@ -79,7 +79,7 @@ iommu_arena_new_node(int nid, struct pci_controller *hose, dma_addr_t base, printk("%s: couldn't allocate arena from node %d\n" " falling back to system-wide allocation\n", __func__, nid); - arena = memblock_alloc(sizeof(*arena), 0); + arena = memblock_alloc(sizeof(*arena), SMP_CACHE_BYTES); } arena->ptes = memblock_alloc_node(sizeof(*arena), align, nid); @@ -92,7 +92,7 @@ iommu_arena_new_node(int nid, struct pci_controller *hose, dma_addr_t base, #else /* CONFIG_DISCONTIGMEM */ - arena = memblock_alloc(sizeof(*arena), 0); + arena = memblock_alloc(sizeof(*arena), SMP_CACHE_BYTES); arena->ptes = memblock_alloc_from(mem_size, align, 0); #endif /* CONFIG_DISCONTIGMEM */ diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 840a4adc69fc..ac7e08886863 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -856,7 +856,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc) */ boot_alias_start = phys_to_idmap(start); if (arm_has_idmap_alias() && boot_alias_start != IDMAP_INVALID_ADDR) { - res = memblock_alloc(sizeof(*res), 0); + res = memblock_alloc(sizeof(*res), SMP_CACHE_BYTES); res->name = "System RAM (boot alias)"; res->start = boot_alias_start; res->end = phys_to_idmap(end); @@ -864,7 +864,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc) request_resource(&iomem_resource, res); } - res = memblock_alloc(sizeof(*res), 0); + res = memblock_alloc(sizeof(*res), SMP_CACHE_BYTES); res->name = "System RAM"; res->start = start; res->end = end; diff --git a/arch/arm/mach-omap2/omap_hwmod.c b/arch/arm/mach-omap2/omap_hwmod.c index cd5732ab0cdf..083dcd9942ce 100644 --- a/arch/arm/mach-omap2/omap_hwmod.c +++ b/arch/arm/mach-omap2/omap_hwmod.c @@ -726,7 +726,7 @@ static int __init _setup_clkctrl_provider(struct device_node *np) u64 size; int i; - provider = memblock_alloc(sizeof(*provider), 0); + provider = memblock_alloc(sizeof(*provider), SMP_CACHE_BYTES); if (!provider) return -ENOMEM; @@ -736,12 +736,14 @@ static int __init _setup_clkctrl_provider(struct device_node *np) of_property_count_elems_of_size(np, "reg", sizeof(u32)) / 2; provider->addr = - memblock_alloc(sizeof(void *) * provider->num_addrs, 0); + memblock_alloc(sizeof(void *) * provider->num_addrs, + SMP_CACHE_BYTES); if (!provider->addr) return -ENOMEM; provider->size = - memblock_alloc(sizeof(u32) * provider->num_addrs, 0); + memblock_alloc(sizeof(u32) * provider->num_addrs, + SMP_CACHE_BYTES); if (!provider->size) return -ENOMEM; diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 7ce7306f1d75..953e316521fc 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -218,7 +218,7 @@ static void __init request_standard_resources(void) num_standard_resources = memblock.memory.cnt; standard_resources = memblock_alloc_low(num_standard_resources * sizeof(*standard_resources), - 0); + SMP_CACHE_BYTES); for_each_memblock(memory, region) { res = &standard_resources[i++]; diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 9a6603f8e409..91bd1e129379 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -361,9 +361,9 @@ static ia64_state_log_t ia64_state_log[IA64_MAX_LOG_TYPES]; #define IA64_LOG_ALLOCATE(it, size) \ {ia64_state_log[it].isl_log[IA64_LOG_CURR_INDEX(it)] = \ - (ia64_err_rec_t *)memblock_alloc(size, 0); \ + (ia64_err_rec_t *)memblock_alloc(size, SMP_CACHE_BYTES); \ ia64_state_log[it].isl_log[IA64_LOG_NEXT_INDEX(it)] = \ - (ia64_err_rec_t *)memblock_alloc(size, 0);} + (ia64_err_rec_t *)memblock_alloc(size, SMP_CACHE_BYTES);} #define IA64_LOG_LOCK_INIT(it) spin_lock_init(&ia64_state_log[it].isl_lock) #define IA64_LOG_LOCK(it) spin_lock_irqsave(&ia64_state_log[it].isl_lock, s) #define IA64_LOG_UNLOCK(it) spin_unlock_irqrestore(&ia64_state_log[it].isl_lock,s) diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c index ab545daff7c3..9340bcb4f29c 100644 --- a/arch/ia64/mm/tlb.c +++ b/arch/ia64/mm/tlb.c @@ -59,8 +59,10 @@ struct ia64_tr_entry *ia64_idtrs[NR_CPUS]; void __init mmu_context_init (void) { - ia64_ctx.bitmap = memblock_alloc((ia64_ctx.max_ctx + 1) >> 3, 0); - ia64_ctx.flushmap = memblock_alloc((ia64_ctx.max_ctx + 1) >> 3, 0); + ia64_ctx.bitmap = memblock_alloc((ia64_ctx.max_ctx + 1) >> 3, + SMP_CACHE_BYTES); + ia64_ctx.flushmap = memblock_alloc((ia64_ctx.max_ctx + 1) >> 3, + SMP_CACHE_BYTES); } /* diff --git a/arch/ia64/sn/kernel/io_common.c b/arch/ia64/sn/kernel/io_common.c index 98f55220c67d..8df13d0d96fa 100644 --- a/arch/ia64/sn/kernel/io_common.c +++ b/arch/ia64/sn/kernel/io_common.c @@ -391,7 +391,9 @@ void __init hubdev_init_node(nodepda_t * npda, cnodeid_t node) if (node >= num_online_nodes()) /* Headless/memless IO nodes */ node = 0; - hubdev_info = (struct hubdev_info *)memblock_alloc_node(size, 0, node); + hubdev_info = (struct hubdev_info *)memblock_alloc_node(size, + SMP_CACHE_BYTES, + node); npda->pdinfo = (void *)hubdev_info; } diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c index 71ad6b0ccab4..a6d40a2c5bff 100644 --- a/arch/ia64/sn/kernel/setup.c +++ b/arch/ia64/sn/kernel/setup.c @@ -511,7 +511,8 @@ static void __init sn_init_pdas(char **cmdline_p) */ for_each_online_node(cnode) { nodepdaindr[cnode] = - memblock_alloc_node(sizeof(nodepda_t), 0, cnode); + memblock_alloc_node(sizeof(nodepda_t), SMP_CACHE_BYTES, + cnode); memset(nodepdaindr[cnode]->phys_cpuid, -1, sizeof(nodepdaindr[cnode]->phys_cpuid)); spin_lock_init(&nodepdaindr[cnode]->ptc_lock); @@ -522,7 +523,7 @@ static void __init sn_init_pdas(char **cmdline_p) */ for (cnode = num_online_nodes(); cnode < num_cnodes; cnode++) nodepdaindr[cnode] = - memblock_alloc_node(sizeof(nodepda_t), 0, 0); + memblock_alloc_node(sizeof(nodepda_t), SMP_CACHE_BYTES, 0); /* * Now copy the array of nodepda pointers to each nodepda. diff --git a/arch/m68k/sun3/sun3dvma.c b/arch/m68k/sun3/sun3dvma.c index 8be8b750c629..4d64711d3d47 100644 --- a/arch/m68k/sun3/sun3dvma.c +++ b/arch/m68k/sun3/sun3dvma.c @@ -268,7 +268,7 @@ void __init dvma_init(void) list_add(&(hole->list), &hole_list); iommu_use = memblock_alloc(IOMMU_TOTAL_ENTRIES * sizeof(unsigned long), - 0); + SMP_CACHE_BYTES); dvma_unmap_iommu(DVMA_START, DVMA_SIZE); diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 8c14988f52f2..b17fd8aafd64 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -376,7 +376,7 @@ void * __ref zalloc_maybe_bootmem(size_t size, gfp_t mask) if (mem_init_done) p = kzalloc(size, mask); else { - p = memblock_alloc(size, 0); + p = memblock_alloc(size, SMP_CACHE_BYTES); if (p) memset(p, 0, size); } diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 41c1683761bb..ea09ed6a80a9 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -916,7 +916,7 @@ static void __init resource_init(void) if (end >= HIGHMEM_START) end = HIGHMEM_START - 1; - res = memblock_alloc(sizeof(struct resource), 0); + res = memblock_alloc(sizeof(struct resource), SMP_CACHE_BYTES); res->start = start; res->end = end; diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index f331a0054b3a..913bfca09c4f 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -198,7 +198,7 @@ void __init allocate_paca_ptrs(void) paca_nr_cpu_ids = nr_cpu_ids; paca_ptrs_size = sizeof(struct paca_struct *) * nr_cpu_ids; - paca_ptrs = __va(memblock_phys_alloc(paca_ptrs_size, 0)); + paca_ptrs = __va(memblock_phys_alloc(paca_ptrs_size, SMP_CACHE_BYTES)); memset(paca_ptrs, 0x88, paca_ptrs_size); } diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index 274bd1442dd9..d3f04f2d8249 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -203,7 +203,8 @@ pci_create_OF_bus_map(void) struct property* of_prop; struct device_node *dn; - of_prop = memblock_alloc(sizeof(struct property) + 256, 0); + of_prop = memblock_alloc(sizeof(struct property) + 256, + SMP_CACHE_BYTES); dn = of_find_node_by_path("/"); if (dn) { memset(of_prop, -1, sizeof(struct property) + 256); diff --git a/arch/powerpc/lib/alloc.c b/arch/powerpc/lib/alloc.c index 5b61704447c1..dedf88a76f58 100644 --- a/arch/powerpc/lib/alloc.c +++ b/arch/powerpc/lib/alloc.c @@ -14,7 +14,7 @@ void * __ref zalloc_maybe_bootmem(size_t size, gfp_t mask) if (slab_is_available()) p = kzalloc(size, mask); else { - p = memblock_alloc(size, 0); + p = memblock_alloc(size, SMP_CACHE_BYTES); } return p; } diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index 67b9d7b669a1..2faca46ad720 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -461,10 +461,11 @@ void __init mmu_context_init(void) /* * Allocate the maps used by context management */ - context_map = memblock_alloc(CTX_MAP_SIZE, 0); - context_mm = memblock_alloc(sizeof(void *) * (LAST_CONTEXT + 1), 0); + context_map = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES); + context_mm = memblock_alloc(sizeof(void *) * (LAST_CONTEXT + 1), + SMP_CACHE_BYTES); #ifdef CONFIG_SMP - stale_map[boot_cpuid] = memblock_alloc(CTX_MAP_SIZE, 0); + stale_map[boot_cpuid] = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES); cpuhp_setup_state_nocalls(CPUHP_POWERPC_MMU_CTX_PREPARE, "powerpc/mmu/ctx:prepare", diff --git a/arch/powerpc/platforms/powermac/nvram.c b/arch/powerpc/platforms/powermac/nvram.c index f3391be7c762..ae54d7fe68f3 100644 --- a/arch/powerpc/platforms/powermac/nvram.c +++ b/arch/powerpc/platforms/powermac/nvram.c @@ -513,7 +513,7 @@ static int __init core99_nvram_setup(struct device_node *dp, unsigned long addr) printk(KERN_ERR "nvram: no address\n"); return -EINVAL; } - nvram_image = memblock_alloc(NVRAM_SIZE, 0); + nvram_image = memblock_alloc(NVRAM_SIZE, SMP_CACHE_BYTES); nvram_data = ioremap(addr, NVRAM_SIZE*2); nvram_naddrs = 1; /* Make sure we get the correct case */ diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index aba81cbf0b36..dd807446801e 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -3769,7 +3769,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, phb_id = be64_to_cpup(prop64); pr_debug(" PHB-ID : 0x%016llx\n", phb_id); - phb = memblock_alloc(sizeof(*phb), 0); + phb = memblock_alloc(sizeof(*phb), SMP_CACHE_BYTES); /* Allocate PCI controller */ phb->hose = hose = pcibios_alloc_controller(np); @@ -3815,7 +3815,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, else phb->diag_data_size = PNV_PCI_DIAG_BUF_SIZE; - phb->diag_data = memblock_alloc(phb->diag_data_size, 0); + phb->diag_data = memblock_alloc(phb->diag_data_size, SMP_CACHE_BYTES); /* Parse 32-bit and IO ranges (if any) */ pci_process_bridge_OF_ranges(hose, np, !hose->global_number); @@ -3874,7 +3874,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, } pemap_off = size; size += phb->ioda.total_pe_num * sizeof(struct pnv_ioda_pe); - aux = memblock_alloc(size, 0); + aux = memblock_alloc(size, SMP_CACHE_BYTES); phb->ioda.pe_alloc = aux; phb->ioda.m64_segmap = aux + m64map_off; phb->ioda.m32_segmap = aux + m32map_off; diff --git a/arch/powerpc/sysdev/msi_bitmap.c b/arch/powerpc/sysdev/msi_bitmap.c index 2444feda831f..d45450f6666a 100644 --- a/arch/powerpc/sysdev/msi_bitmap.c +++ b/arch/powerpc/sysdev/msi_bitmap.c @@ -128,7 +128,7 @@ int __ref msi_bitmap_alloc(struct msi_bitmap *bmp, unsigned int irq_count, if (bmp->bitmap_from_slab) bmp->bitmap = kzalloc(size, GFP_KERNEL); else { - bmp->bitmap = memblock_alloc(size, 0); + bmp->bitmap = memblock_alloc(size, SMP_CACHE_BYTES); /* the bitmap won't be freed from memblock allocator */ kmemleak_not_leak(bmp->bitmap); } diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c index 673816880cce..624cb47cc9cd 100644 --- a/arch/um/drivers/net_kern.c +++ b/arch/um/drivers/net_kern.c @@ -650,7 +650,7 @@ static int __init eth_setup(char *str) return 1; } - new = memblock_alloc(sizeof(*new), 0); + new = memblock_alloc(sizeof(*new), SMP_CACHE_BYTES); INIT_LIST_HEAD(&new->list); new->index = n; diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index 2b4dded11a7a..10d8d20eb9ec 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -1580,7 +1580,7 @@ static int __init vector_setup(char *str) str, error); return 1; } - new = memblock_alloc(sizeof(*new), 0); + new = memblock_alloc(sizeof(*new), SMP_CACHE_BYTES); INIT_LIST_HEAD(&new->list); new->unit = n; new->arguments = str; diff --git a/arch/um/kernel/initrd.c b/arch/um/kernel/initrd.c index 3678f5b05e42..ce169ea87e61 100644 --- a/arch/um/kernel/initrd.c +++ b/arch/um/kernel/initrd.c @@ -36,7 +36,7 @@ int __init read_initrd(void) return 0; } - area = memblock_alloc(size, 0); + area = memblock_alloc(size, SMP_CACHE_BYTES); if (load_initrd(initrd, area, size) == -1) return 0; diff --git a/arch/unicore32/kernel/setup.c b/arch/unicore32/kernel/setup.c index b2c38b32ea57..4b0cb68c355a 100644 --- a/arch/unicore32/kernel/setup.c +++ b/arch/unicore32/kernel/setup.c @@ -206,7 +206,7 @@ request_standard_resources(struct meminfo *mi) if (mi->bank[i].size == 0) continue; - res = memblock_alloc_low(sizeof(*res), 0); + res = memblock_alloc_low(sizeof(*res), SMP_CACHE_BYTES); res->name = "System RAM"; res->start = mi->bank[i].start; res->end = mi->bank[i].start + mi->bank[i].size - 1; diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 7f5d212551d4..92c76bf97ad8 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -934,7 +934,7 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) */ #define HPET_RESOURCE_NAME_SIZE 9 hpet_res = memblock_alloc(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE, - 0); + SMP_CACHE_BYTES); hpet_res->name = (void *)&hpet_res[1]; hpet_res->flags = IORESOURCE_MEM; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5fbc57e4b0b9..2953bbf05c08 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2578,7 +2578,7 @@ static struct resource * __init ioapic_setup_resources(void) n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); n *= nr_ioapics; - mem = memblock_alloc(n, 0); + mem = memblock_alloc(n, SMP_CACHE_BYTES); res = (void *)mem; mem += sizeof(struct resource) * nr_ioapics; diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 68ff62bffbab..50895c2f937d 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1093,7 +1093,8 @@ void __init e820__reserve_resources(void) struct resource *res; u64 end; - res = memblock_alloc(sizeof(*res) * e820_table->nr_entries, 0); + res = memblock_alloc(sizeof(*res) * e820_table->nr_entries, + SMP_CACHE_BYTES); e820_res = res; for (i = 0; i < e820_table->nr_entries; i++) { diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c index 115c8e4173bb..24d2175a9480 100644 --- a/arch/x86/platform/olpc/olpc_dt.c +++ b/arch/x86/platform/olpc/olpc_dt.c @@ -141,7 +141,7 @@ void * __init prom_early_alloc(unsigned long size) * fast enough on the platforms we care about while minimizing * wasted bootmem) and hand off chunks of it to callers. */ - res = memblock_alloc(chunk_size, 0); + res = memblock_alloc(chunk_size, SMP_CACHE_BYTES); BUG_ON(!res); prom_early_allocated += chunk_size; memset(res, 0, chunk_size); diff --git a/arch/xtensa/platforms/iss/network.c b/arch/xtensa/platforms/iss/network.c index 190846dddc67..d052712373b6 100644 --- a/arch/xtensa/platforms/iss/network.c +++ b/arch/xtensa/platforms/iss/network.c @@ -646,7 +646,7 @@ static int __init iss_net_setup(char *str) return 1; } - new = memblock_alloc(sizeof(*new), 0); + new = memblock_alloc(sizeof(*new), SMP_CACHE_BYTES); if (new == NULL) { pr_err("Alloc_bootmem failed\n"); return 1; diff --git a/drivers/clk/ti/clk.c b/drivers/clk/ti/clk.c index 5b2867a33b98..e205af814582 100644 --- a/drivers/clk/ti/clk.c +++ b/drivers/clk/ti/clk.c @@ -342,7 +342,7 @@ void __init omap2_clk_legacy_provider_init(int index, void __iomem *mem) { struct clk_iomap *io; - io = memblock_alloc(sizeof(*io), 0); + io = memblock_alloc(sizeof(*io), SMP_CACHE_BYTES); io->mem = mem; diff --git a/drivers/firmware/efi/memmap.c b/drivers/firmware/efi/memmap.c index ef618bceb79a..fa2904fb841f 100644 --- a/drivers/firmware/efi/memmap.c +++ b/drivers/firmware/efi/memmap.c @@ -15,7 +15,7 @@ static phys_addr_t __init __efi_memmap_alloc_early(unsigned long size) { - return memblock_phys_alloc(size, 0); + return memblock_phys_alloc(size, SMP_CACHE_BYTES); } static phys_addr_t __init __efi_memmap_alloc_late(unsigned long size) diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c index 2a23453f005a..d168c87c7d30 100644 --- a/drivers/firmware/memmap.c +++ b/drivers/firmware/memmap.c @@ -333,7 +333,8 @@ int __init firmware_map_add_early(u64 start, u64 end, const char *type) { struct firmware_map_entry *entry; - entry = memblock_alloc(sizeof(struct firmware_map_entry), 0); + entry = memblock_alloc(sizeof(struct firmware_map_entry), + SMP_CACHE_BYTES); if (WARN_ON(!entry)) return -ENOMEM; diff --git a/drivers/macintosh/smu.c b/drivers/macintosh/smu.c index 880a81c82b7a..0a0b8e1f4236 100644 --- a/drivers/macintosh/smu.c +++ b/drivers/macintosh/smu.c @@ -492,7 +492,7 @@ int __init smu_init (void) goto fail_np; } - smu = memblock_alloc(sizeof(struct smu_device), 0); + smu = memblock_alloc(sizeof(struct smu_device), SMP_CACHE_BYTES); spin_lock_init(&smu->lock); INIT_LIST_HEAD(&smu->cmd_list); diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index d6255c276a41..1977ee0adcb1 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -36,6 +36,7 @@ int __init __weak early_init_dt_alloc_reserved_memory_arch(phys_addr_t size, * panic()s on allocation failure. */ end = !end ? MEMBLOCK_ALLOC_ANYWHERE : end; + align = !align ? SMP_CACHE_BYTES : align; base = __memblock_alloc_base(size, align, end); if (!base) return -ENOMEM; diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 1b4d85879cbe..aee299a6aa76 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -406,7 +406,8 @@ static inline void * __init memblock_alloc_node(phys_addr_t size, static inline void * __init memblock_alloc_node_nopanic(phys_addr_t size, int nid) { - return memblock_alloc_try_nid_nopanic(size, 0, MEMBLOCK_LOW_LIMIT, + return memblock_alloc_try_nid_nopanic(size, SMP_CACHE_BYTES, + MEMBLOCK_LOW_LIMIT, MEMBLOCK_ALLOC_ACCESSIBLE, nid); } diff --git a/init/main.c b/init/main.c index 51b8e7b8ae5b..ee147103ba1b 100644 --- a/init/main.c +++ b/init/main.c @@ -375,10 +375,11 @@ static inline void smp_prepare_cpus(unsigned int maxcpus) { } static void __init setup_command_line(char *command_line) { saved_command_line = - memblock_alloc(strlen(boot_command_line) + 1, 0); + memblock_alloc(strlen(boot_command_line) + 1, SMP_CACHE_BYTES); initcall_command_line = - memblock_alloc(strlen(boot_command_line) + 1, 0); - static_command_line = memblock_alloc(strlen(command_line) + 1, 0); + memblock_alloc(strlen(boot_command_line) + 1, SMP_CACHE_BYTES); + static_command_line = memblock_alloc(strlen(command_line) + 1, + SMP_CACHE_BYTES); strcpy(saved_command_line, boot_command_line); strcpy(static_command_line, command_line); } @@ -773,8 +774,10 @@ static int __init initcall_blacklist(char *str) str_entry = strsep(&str, ","); if (str_entry) { pr_debug("blacklisting initcall %s\n", str_entry); - entry = memblock_alloc(sizeof(*entry), 0); - entry->buf = memblock_alloc(strlen(str_entry) + 1, 0); + entry = memblock_alloc(sizeof(*entry), + SMP_CACHE_BYTES); + entry->buf = memblock_alloc(strlen(str_entry) + 1, + SMP_CACHE_BYTES); strcpy(entry->buf, str_entry); list_add(&entry->next, &blacklisted_initcalls); } diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 3c9e365438ad..b0308a2c6000 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -963,7 +963,8 @@ void __init __register_nosave_region(unsigned long start_pfn, BUG_ON(!region); } else { /* This allocation cannot fail */ - region = memblock_alloc(sizeof(struct nosave_region), 0); + region = memblock_alloc(sizeof(struct nosave_region), + SMP_CACHE_BYTES); } region->start_pfn = start_pfn; region->end_pfn = end_pfn; diff --git a/lib/cpumask.c b/lib/cpumask.c index 75b5e7672c4c..8d666ab84b5c 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -163,7 +163,7 @@ EXPORT_SYMBOL(zalloc_cpumask_var); */ void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask) { - *mask = memblock_alloc(cpumask_size(), 0); + *mask = memblock_alloc(cpumask_size(), SMP_CACHE_BYTES); } /** diff --git a/mm/memblock.c b/mm/memblock.c index c655342569f8..839531133816 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1247,9 +1247,6 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, { phys_addr_t found; - if (!align) - align = SMP_CACHE_BYTES; - found = memblock_find_in_range_node(size, align, start, end, nid, flags); if (found && !memblock_reserve(found, size)) { @@ -1343,8 +1340,6 @@ phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t ali * The allocation is performed from memory region limited by * memblock.current_limit if @max_addr == %MEMBLOCK_ALLOC_ACCESSIBLE. * - * The memory block is aligned on %SMP_CACHE_BYTES if @align == 0. - * * The phys address of allocated boot memory block is converted to virtual and * allocated memory is reset to 0. * @@ -1374,9 +1369,6 @@ static void * __init memblock_alloc_internal( if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, nid); - if (!align) - align = SMP_CACHE_BYTES; - if (max_addr > memblock.current_limit) max_addr = memblock.current_limit; again: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ef289fadec0e..a919ba5cb3c8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7710,9 +7710,11 @@ void *__init alloc_large_system_hash(const char *tablename, size = bucketsize << log2qty; if (flags & HASH_EARLY) { if (flags & HASH_ZERO) - table = memblock_alloc_nopanic(size, 0); + table = memblock_alloc_nopanic(size, + SMP_CACHE_BYTES); else - table = memblock_alloc_raw(size, 0); + table = memblock_alloc_raw(size, + SMP_CACHE_BYTES); } else if (hashdist) { table = __vmalloc(size, gfp_flags, PAGE_KERNEL); } else { diff --git a/mm/percpu.c b/mm/percpu.c index 61cdbb3b3736..a6b74c6fe0be 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1102,8 +1102,8 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, /* allocate chunk */ chunk = memblock_alloc(sizeof(struct pcpu_chunk) + - BITS_TO_LONGS(region_size >> PAGE_SHIFT), - 0); + BITS_TO_LONGS(region_size >> PAGE_SHIFT), + SMP_CACHE_BYTES); INIT_LIST_HEAD(&chunk->list); @@ -1114,12 +1114,12 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, chunk->nr_pages = region_size >> PAGE_SHIFT; region_bits = pcpu_chunk_map_bits(chunk); - chunk->alloc_map = memblock_alloc(BITS_TO_LONGS(region_bits) * - sizeof(chunk->alloc_map[0]), 0); - chunk->bound_map = memblock_alloc(BITS_TO_LONGS(region_bits + 1) * - sizeof(chunk->bound_map[0]), 0); - chunk->md_blocks = memblock_alloc(pcpu_chunk_nr_blocks(chunk) * - sizeof(chunk->md_blocks[0]), 0); + chunk->alloc_map = memblock_alloc(BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]), + SMP_CACHE_BYTES); + chunk->bound_map = memblock_alloc(BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]), + SMP_CACHE_BYTES); + chunk->md_blocks = memblock_alloc(pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]), + SMP_CACHE_BYTES); pcpu_init_md_blocks(chunk); /* manage populated page bitmap */ @@ -2075,12 +2075,14 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); /* process group information and build config tables accordingly */ - group_offsets = memblock_alloc(ai->nr_groups * - sizeof(group_offsets[0]), 0); - group_sizes = memblock_alloc(ai->nr_groups * - sizeof(group_sizes[0]), 0); - unit_map = memblock_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0); - unit_off = memblock_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0); + group_offsets = memblock_alloc(ai->nr_groups * sizeof(group_offsets[0]), + SMP_CACHE_BYTES); + group_sizes = memblock_alloc(ai->nr_groups * sizeof(group_sizes[0]), + SMP_CACHE_BYTES); + unit_map = memblock_alloc(nr_cpu_ids * sizeof(unit_map[0]), + SMP_CACHE_BYTES); + unit_off = memblock_alloc(nr_cpu_ids * sizeof(unit_off[0]), + SMP_CACHE_BYTES); for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = UINT_MAX; @@ -2144,8 +2146,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, * empty chunks. */ pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; - pcpu_slot = memblock_alloc( - pcpu_nr_slots * sizeof(pcpu_slot[0]), 0); + pcpu_slot = memblock_alloc(pcpu_nr_slots * sizeof(pcpu_slot[0]), + SMP_CACHE_BYTES); for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); @@ -2458,7 +2460,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); - areas = memblock_alloc_nopanic(areas_size, 0); + areas = memblock_alloc_nopanic(areas_size, SMP_CACHE_BYTES); if (!areas) { rc = -ENOMEM; goto out_free; @@ -2599,7 +2601,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, /* unaligned allocations can't be freed, round up to page size */ pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * sizeof(pages[0])); - pages = memblock_alloc(pages_size, 0); + pages = memblock_alloc(pages_size, SMP_CACHE_BYTES); /* allocate pages */ j = 0; diff --git a/mm/sparse.c b/mm/sparse.c index ab2ac45e0440..33307fc05c4d 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -68,7 +68,8 @@ static noinline struct mem_section __ref *sparse_index_alloc(int nid) if (slab_is_available()) section = kzalloc_node(array_size, GFP_KERNEL, nid); else - section = memblock_alloc_node(array_size, 0, nid); + section = memblock_alloc_node(array_size, SMP_CACHE_BYTES, + nid); return section; } -- cgit v1.2.3 From 0962590e553331db2cc0aef2dc35c57f6300dbbe Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 1 Nov 2018 00:05:52 +0100 Subject: bpf: fix partial copy of map_ptr when dst is scalar ALU operations on pointers such as scalar_reg += map_value_ptr are handled in adjust_ptr_min_max_vals(). Problem is however that map_ptr and range in the register state share a union, so transferring state through dst_reg->range = ptr_reg->range is just buggy as any new map_ptr in the dst_reg is then truncated (or null) for subsequent checks. Fix this by adding a raw member and use it for copying state over to dst_reg. Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Signed-off-by: Daniel Borkmann Cc: Edward Cree Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 3 +++ kernel/bpf/verifier.c | 10 ++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 9e8056ec20fa..d93e89761a8b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -51,6 +51,9 @@ struct bpf_reg_state { * PTR_TO_MAP_VALUE_OR_NULL */ struct bpf_map *map_ptr; + + /* Max size from any of the above. */ + unsigned long raw; }; /* Fixed part of pointer offset, pointer types only */ s32 off; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 171a2c88e77d..774fa40a32ae 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3046,7 +3046,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->umax_value = umax_ptr; dst_reg->var_off = ptr_reg->var_off; dst_reg->off = ptr_reg->off + smin_val; - dst_reg->range = ptr_reg->range; + dst_reg->raw = ptr_reg->raw; break; } /* A new variable offset is created. Note that off_reg->off @@ -3076,10 +3076,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; + dst_reg->raw = ptr_reg->raw; if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ - dst_reg->range = 0; + dst_reg->raw = 0; } break; case BPF_SUB: @@ -3108,7 +3109,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->var_off = ptr_reg->var_off; dst_reg->id = ptr_reg->id; dst_reg->off = ptr_reg->off - smin_val; - dst_reg->range = ptr_reg->range; + dst_reg->raw = ptr_reg->raw; break; } /* A new variable offset is created. If the subtrahend is known @@ -3134,11 +3135,12 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; + dst_reg->raw = ptr_reg->raw; if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ if (smin_val < 0) - dst_reg->range = 0; + dst_reg->raw = 0; } break; case BPF_AND: -- cgit v1.2.3 From 4d31f30148cea6e97e42616231eed55295117fe7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 1 Nov 2018 00:05:53 +0100 Subject: bpf: don't set id on after map lookup with ptr_to_map_val return In the verifier there is no such semantics where registers with PTR_TO_MAP_VALUE type have an id assigned to them. This is only used in PTR_TO_MAP_VALUE_OR_NULL and later on nullified once the test against NULL has been pattern matched and type transformed into PTR_TO_MAP_VALUE. Fixes: 3e6a4b3e0289 ("bpf/verifier: introduce BPF_PTR_TO_MAP_VALUE") Signed-off-by: Daniel Borkmann Cc: Roman Gushchin Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 774fa40a32ae..1971ca325fb4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2852,10 +2852,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL || fn->ret_type == RET_PTR_TO_MAP_VALUE) { - if (fn->ret_type == RET_PTR_TO_MAP_VALUE) - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; - else - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); /* remember map_ptr, so that check_map_access() @@ -2868,7 +2864,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; - regs[BPF_REG_0].id = ++env->id_gen; + if (fn->ret_type == RET_PTR_TO_MAP_VALUE) { + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; + } else { + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; + } } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { int id = acquire_reference_state(env, insn_idx); if (id < 0) -- cgit v1.2.3 From 57f01796f14fecf00d330fe39c8d2477ced9cd79 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Thu, 1 Nov 2018 00:35:05 +0000 Subject: irq/matrix: Fix memory overallocation IRQ_MATRIX_SIZE is the number of longs needed for a bitmap, multiplied by the size of a long, yielding a byte count. But it is used to size an array of longs, which is way more memory than is needed. Change IRQ_MATRIX_SIZE so it is just the number of longs needed and the arrays come out the correct size. Fixes: 2f75d9e1c905 ("genirq: Implement bitmap matrix allocator") Signed-off-by: Michael Kelley Signed-off-by: Thomas Gleixner Cc: KY Srinivasan Link: https://lkml.kernel.org/r/1541032428-10392-1-git-send-email-mikelley@microsoft.com --- kernel/irq/matrix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 6e6d467f3dec..1f0985adf193 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -8,7 +8,7 @@ #include #include -#define IRQ_MATRIX_SIZE (BITS_TO_LONGS(IRQ_MATRIX_BITS) * sizeof(unsigned long)) +#define IRQ_MATRIX_SIZE (BITS_TO_LONGS(IRQ_MATRIX_BITS)) struct cpumap { unsigned int available; -- cgit v1.2.3 From 98f76206b33504b934209d16196477dfa519a807 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Fri, 12 Oct 2018 14:42:53 +0100 Subject: compat: Cleanup in_compat_syscall() callers Now that in_compat_syscall() is consistent on all architectures and does not longer report true on native i686, the workarounds (ifdeffery and helpers) can be removed. Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Ard Biesheuvel Cc: Andy Lutomirsky Cc: "David S. Miller" Cc: Herbert Xu Cc: "H. Peter Anvin" Cc: John Stultz Cc: "Kirill A. Shutemov" Cc: Oleg Nesterov Cc: Steffen Klassert Cc: Stephen Boyd Cc: Steven Rostedt Cc: linux-efi@vger.kernel.org Cc: netdev@vger.kernel.org Link: https://lkml.kernel.org/r/20181012134253.23266-3-dima@arista.com --- drivers/firmware/efi/efivars.c | 16 ++++------------ kernel/time/time.c | 2 +- net/xfrm/xfrm_state.c | 2 -- net/xfrm/xfrm_user.c | 2 -- 4 files changed, 5 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/drivers/firmware/efi/efivars.c b/drivers/firmware/efi/efivars.c index 3e626fd9bd4e..8061667a6765 100644 --- a/drivers/firmware/efi/efivars.c +++ b/drivers/firmware/efi/efivars.c @@ -229,14 +229,6 @@ sanity_check(struct efi_variable *var, efi_char16_t *name, efi_guid_t vendor, return 0; } -static inline bool is_compat(void) -{ - if (IS_ENABLED(CONFIG_COMPAT) && in_compat_syscall()) - return true; - - return false; -} - static void copy_out_compat(struct efi_variable *dst, struct compat_efi_variable *src) { @@ -263,7 +255,7 @@ efivar_store_raw(struct efivar_entry *entry, const char *buf, size_t count) u8 *data; int err; - if (is_compat()) { + if (in_compat_syscall()) { struct compat_efi_variable *compat; if (count != sizeof(*compat)) @@ -324,7 +316,7 @@ efivar_show_raw(struct efivar_entry *entry, char *buf) &entry->var.DataSize, entry->var.Data)) return -EIO; - if (is_compat()) { + if (in_compat_syscall()) { compat = (struct compat_efi_variable *)buf; size = sizeof(*compat); @@ -418,7 +410,7 @@ static ssize_t efivar_create(struct file *filp, struct kobject *kobj, struct compat_efi_variable *compat = (struct compat_efi_variable *)buf; struct efi_variable *new_var = (struct efi_variable *)buf; struct efivar_entry *new_entry; - bool need_compat = is_compat(); + bool need_compat = in_compat_syscall(); efi_char16_t *name; unsigned long size; u32 attributes; @@ -495,7 +487,7 @@ static ssize_t efivar_delete(struct file *filp, struct kobject *kobj, if (!capable(CAP_SYS_ADMIN)) return -EACCES; - if (is_compat()) { + if (in_compat_syscall()) { if (count != sizeof(*compat)) return -EINVAL; diff --git a/kernel/time/time.c b/kernel/time/time.c index e3a7f7fd3abc..ad204cf6d001 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -842,7 +842,7 @@ int get_timespec64(struct timespec64 *ts, ts->tv_sec = kts.tv_sec; /* Zero out the padding for 32 bit systems or in compat mode */ - if (IS_ENABLED(CONFIG_64BIT_TIME) && (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall())) + if (IS_ENABLED(CONFIG_64BIT_TIME) && in_compat_syscall()) kts.tv_nsec &= 0xFFFFFFFFUL; ts->tv_nsec = kts.tv_nsec; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index b669262682c9..dc4a9f1fb941 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2077,10 +2077,8 @@ int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen struct xfrm_mgr *km; struct xfrm_policy *pol = NULL; -#ifdef CONFIG_COMPAT if (in_compat_syscall()) return -EOPNOTSUPP; -#endif if (!optval && !optlen) { xfrm_sk_policy_insert(sk, XFRM_POLICY_IN, NULL); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index ca7a207b81a9..c9a84e22f5d5 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2621,10 +2621,8 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, const struct xfrm_link *link; int type, err; -#ifdef CONFIG_COMPAT if (in_compat_syscall()) return -EOPNOTSUPP; -#endif type = nlh->nlmsg_type; if (type > XFRM_MSG_MAX) -- cgit v1.2.3 From b5f2954d30c77649bce9c27e7a0a94299d9cfdf8 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Thu, 1 Nov 2018 17:24:10 -0400 Subject: blkcg: revert blkcg cleanups series This reverts a series committed earlier due to null pointer exception bug report in [1]. It seems there are edge case interactions that I did not consider and will need some time to understand what causes the adverse interactions. The original series can be found in [2] with a follow up series in [3]. [1] https://www.spinics.net/lists/cgroups/msg20719.html [2] https://lore.kernel.org/lkml/20180911184137.35897-1-dennisszhou@gmail.com/ [3] https://lore.kernel.org/lkml/20181020185612.51587-1-dennis@kernel.org/ This reverts the following commits: d459d853c2ed, b2c3fa546705, 101246ec02b5, b3b9f24f5fcc, e2b0989954ae, f0fcb3ec89f3, c839e7a03f92, bdc2491708c4, 74b7c02a9bc1, 5bf9a1f3b4ef, a7b39b4e961c, 07b05bcc3213, 49f4c2dc2b50, 27e6fa996c53 Signed-off-by: Dennis Zhou Signed-off-by: Jens Axboe --- Documentation/admin-guide/cgroup-v2.rst | 8 +- block/bfq-cgroup.c | 4 +- block/bfq-iosched.c | 2 +- block/bio.c | 174 +++++++++----------------------- block/blk-cgroup.c | 123 +++++++--------------- block/blk-core.c | 1 - block/blk-iolatency.c | 26 ++++- block/blk-throttle.c | 13 ++- block/bounce.c | 4 +- block/cfq-iosched.c | 4 +- drivers/block/loop.c | 5 +- drivers/md/raid0.c | 2 +- fs/buffer.c | 10 +- fs/ext4/page-io.c | 2 +- include/linux/bio.h | 26 ++--- include/linux/blk-cgroup.h | 145 +++++++++----------------- include/linux/blk_types.h | 1 + include/linux/cgroup.h | 2 - include/linux/writeback.h | 5 +- kernel/cgroup/cgroup.c | 48 ++------- kernel/trace/blktrace.c | 4 +- mm/page_io.c | 2 +- 22 files changed, 208 insertions(+), 403 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index caf36105a1c7..184193bcb262 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1857,10 +1857,8 @@ following two functions. wbc_init_bio(@wbc, @bio) Should be called for each bio carrying writeback data and - associates the bio with the inode's owner cgroup and the - corresponding request queue. This must be called after - a queue (device) has been associated with the bio and - before submission. + associates the bio with the inode's owner cgroup. Can be + called anytime between bio allocation and submission. wbc_account_io(@wbc, @page, @bytes) Should be called for each data segment being written out. @@ -1879,7 +1877,7 @@ the configuration, the bio may be executed at a lower priority and if the writeback session is holding shared resources, e.g. a journal entry, may lead to priority inversion. There is no one easy solution for the problem. Filesystems can try to work around specific problem -cases by skipping wbc_init_bio() or using bio_associate_create_blkg() +cases by skipping wbc_init_bio() or using bio_associate_blkcg() directly. diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index d9a7916ff0ab..9fe5952d117d 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -642,7 +642,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) uint64_t serial_nr; rcu_read_lock(); - serial_nr = __bio_blkcg(bio)->css.serial_nr; + serial_nr = bio_blkcg(bio)->css.serial_nr; /* * Check whether blkcg has changed. The condition may trigger @@ -651,7 +651,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) goto out; - bfqg = __bfq_bic_change_cgroup(bfqd, bic, __bio_blkcg(bio)); + bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); /* * Update blkg_path for bfq_log_* functions. We cache this * path, and update it here, for the following diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 6075100f03a5..3a27d31fcda6 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4384,7 +4384,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, rcu_read_lock(); - bfqg = bfq_find_set_group(bfqd, __bio_blkcg(bio)); + bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); if (!bfqg) { bfqq = &bfqd->oom_bfqq; goto out; diff --git a/block/bio.c b/block/bio.c index bbfeb4ee2892..4a5a036268fb 100644 --- a/block/bio.c +++ b/block/bio.c @@ -609,9 +609,7 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) bio->bi_iter = bio_src->bi_iter; bio->bi_io_vec = bio_src->bi_io_vec; - bio_clone_blkg_association(bio, bio_src); - - blkcg_bio_issue_init(bio); + bio_clone_blkcg_association(bio, bio_src); } EXPORT_SYMBOL(__bio_clone_fast); @@ -1956,151 +1954,69 @@ EXPORT_SYMBOL(bioset_init_from_src); #ifdef CONFIG_BLK_CGROUP -/** - * bio_associate_blkg - associate a bio with the a blkg - * @bio: target bio - * @blkg: the blkg to associate - * - * This tries to associate @bio with the specified blkg. Association failure - * is handled by walking up the blkg tree. Therefore, the blkg associated can - * be anything between @blkg and the root_blkg. This situation only happens - * when a cgroup is dying and then the remaining bios will spill to the closest - * alive blkg. - * - * A reference will be taken on the @blkg and will be released when @bio is - * freed. - */ -int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) -{ - if (unlikely(bio->bi_blkg)) - return -EBUSY; - bio->bi_blkg = blkg_tryget_closest(blkg); - return 0; -} - -/** - * __bio_associate_blkg_from_css - internal blkg association function - * - * This in the core association function that all association paths rely on. - * A blkg reference is taken which is released upon freeing of the bio. - */ -static int __bio_associate_blkg_from_css(struct bio *bio, - struct cgroup_subsys_state *css) -{ - struct request_queue *q = bio->bi_disk->queue; - struct blkcg_gq *blkg; - int ret; - - rcu_read_lock(); - - if (!css || !css->parent) - blkg = q->root_blkg; - else - blkg = blkg_lookup_create(css_to_blkcg(css), q); - - ret = bio_associate_blkg(bio, blkg); - - rcu_read_unlock(); - return ret; -} - -/** - * bio_associate_blkg_from_css - associate a bio with a specified css - * @bio: target bio - * @css: target css - * - * Associate @bio with the blkg found by combining the css's blkg and the - * request_queue of the @bio. This falls back to the queue's root_blkg if - * the association fails with the css. - */ -int bio_associate_blkg_from_css(struct bio *bio, - struct cgroup_subsys_state *css) -{ - if (unlikely(bio->bi_blkg)) - return -EBUSY; - return __bio_associate_blkg_from_css(bio, css); -} -EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); - #ifdef CONFIG_MEMCG /** - * bio_associate_blkg_from_page - associate a bio with the page's blkg + * bio_associate_blkcg_from_page - associate a bio with the page's blkcg * @bio: target bio * @page: the page to lookup the blkcg from * - * Associate @bio with the blkg from @page's owning memcg and the respective - * request_queue. If cgroup_e_css returns NULL, fall back to the queue's - * root_blkg. - * - * Note: this must be called after bio has an associated device. + * Associate @bio with the blkcg from @page's owning memcg. This works like + * every other associate function wrt references. */ -int bio_associate_blkg_from_page(struct bio *bio, struct page *page) +int bio_associate_blkcg_from_page(struct bio *bio, struct page *page) { - struct cgroup_subsys_state *css; - int ret; + struct cgroup_subsys_state *blkcg_css; - if (unlikely(bio->bi_blkg)) + if (unlikely(bio->bi_css)) return -EBUSY; if (!page->mem_cgroup) return 0; - - rcu_read_lock(); - - css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys); - - ret = __bio_associate_blkg_from_css(bio, css); - - rcu_read_unlock(); - return ret; + blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup, + &io_cgrp_subsys); + bio->bi_css = blkcg_css; + return 0; } #endif /* CONFIG_MEMCG */ /** - * bio_associate_create_blkg - associate a bio with a blkg from q - * @q: request_queue where bio is going + * bio_associate_blkcg - associate a bio with the specified blkcg * @bio: target bio + * @blkcg_css: css of the blkcg to associate + * + * Associate @bio with the blkcg specified by @blkcg_css. Block layer will + * treat @bio as if it were issued by a task which belongs to the blkcg. * - * Associate @bio with the blkg found from the bio's css and the request_queue. - * If one is not found, bio_lookup_blkg creates the blkg. This falls back to - * the queue's root_blkg if association fails. + * This function takes an extra reference of @blkcg_css which will be put + * when @bio is released. The caller must own @bio and is responsible for + * synchronizing calls to this function. */ -int bio_associate_create_blkg(struct request_queue *q, struct bio *bio) +int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) { - struct cgroup_subsys_state *css; - int ret = 0; - - /* someone has already associated this bio with a blkg */ - if (bio->bi_blkg) - return ret; - - rcu_read_lock(); - - css = blkcg_css(); - - ret = __bio_associate_blkg_from_css(bio, css); - - rcu_read_unlock(); - return ret; + if (unlikely(bio->bi_css)) + return -EBUSY; + css_get(blkcg_css); + bio->bi_css = blkcg_css; + return 0; } +EXPORT_SYMBOL_GPL(bio_associate_blkcg); /** - * bio_reassociate_blkg - reassociate a bio with a blkg from q - * @q: request_queue where bio is going + * bio_associate_blkg - associate a bio with the specified blkg * @bio: target bio + * @blkg: the blkg to associate * - * When submitting a bio, multiple recursive calls to make_request() may occur. - * This causes the initial associate done in blkcg_bio_issue_check() to be - * incorrect and reference the prior request_queue. This performs reassociation - * when this situation happens. + * Associate @bio with the blkg specified by @blkg. This is the queue specific + * blkcg information associated with the @bio, a reference will be taken on the + * @blkg and will be freed when the bio is freed. */ -int bio_reassociate_blkg(struct request_queue *q, struct bio *bio) +int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) { - if (bio->bi_blkg) { - blkg_put(bio->bi_blkg); - bio->bi_blkg = NULL; - } - - return bio_associate_create_blkg(q, bio); + if (unlikely(bio->bi_blkg)) + return -EBUSY; + if (!blkg_try_get(blkg)) + return -ENODEV; + bio->bi_blkg = blkg; + return 0; } /** @@ -2113,6 +2029,10 @@ void bio_disassociate_task(struct bio *bio) put_io_context(bio->bi_ioc); bio->bi_ioc = NULL; } + if (bio->bi_css) { + css_put(bio->bi_css); + bio->bi_css = NULL; + } if (bio->bi_blkg) { blkg_put(bio->bi_blkg); bio->bi_blkg = NULL; @@ -2120,16 +2040,16 @@ void bio_disassociate_task(struct bio *bio) } /** - * bio_clone_blkg_association - clone blkg association from src to dst bio + * bio_clone_blkcg_association - clone blkcg association from src to dst bio * @dst: destination bio * @src: source bio */ -void bio_clone_blkg_association(struct bio *dst, struct bio *src) +void bio_clone_blkcg_association(struct bio *dst, struct bio *src) { - if (src->bi_blkg) - bio_associate_blkg(dst, src->bi_blkg); + if (src->bi_css) + WARN_ON(bio_associate_blkcg(dst, src->bi_css)); } -EXPORT_SYMBOL_GPL(bio_clone_blkg_association); +EXPORT_SYMBOL_GPL(bio_clone_blkcg_association); #endif /* CONFIG_BLK_CGROUP */ static void __init biovec_init_slabs(void) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 992da5592c6e..c630e02836a8 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -84,37 +84,6 @@ static void blkg_free(struct blkcg_gq *blkg) kfree(blkg); } -static void __blkg_release(struct rcu_head *rcu) -{ - struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); - - percpu_ref_exit(&blkg->refcnt); - - /* release the blkcg and parent blkg refs this blkg has been holding */ - css_put(&blkg->blkcg->css); - if (blkg->parent) - blkg_put(blkg->parent); - - wb_congested_put(blkg->wb_congested); - - blkg_free(blkg); -} - -/* - * A group is RCU protected, but having an rcu lock does not mean that one - * can access all the fields of blkg and assume these are valid. For - * example, don't try to follow throtl_data and request queue links. - * - * Having a reference to blkg under an rcu allows accesses to only values - * local to groups like group stats and group rate limits. - */ -static void blkg_release(struct percpu_ref *ref) -{ - struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt); - - call_rcu(&blkg->rcu_head, __blkg_release); -} - /** * blkg_alloc - allocate a blkg * @blkcg: block cgroup the new blkg is associated with @@ -141,6 +110,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, blkg->q = q; INIT_LIST_HEAD(&blkg->q_node); blkg->blkcg = blkcg; + atomic_set(&blkg->refcnt, 1); /* root blkg uses @q->root_rl, init rl only for !root blkgs */ if (blkcg != &blkcg_root) { @@ -247,11 +217,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, blkg_get(blkg->parent); } - ret = percpu_ref_init(&blkg->refcnt, blkg_release, 0, - GFP_NOWAIT | __GFP_NOWARN); - if (ret) - goto err_cancel_ref; - /* invoke per-policy init */ for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; @@ -284,8 +249,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, blkg_put(blkg); return ERR_PTR(ret); -err_cancel_ref: - percpu_ref_exit(&blkg->refcnt); err_put_congested: wb_congested_put(wb_congested); err_put_css: @@ -296,7 +259,7 @@ err_free_blkg: } /** - * __blkg_lookup_create - lookup blkg, try to create one if not there + * blkg_lookup_create - lookup blkg, try to create one if not there * @blkcg: blkcg of interest * @q: request_queue of interest * @@ -305,11 +268,12 @@ err_free_blkg: * that all non-root blkg's have access to the parent blkg. This function * should be called under RCU read lock and @q->queue_lock. * - * Returns the blkg or the closest blkg if blkg_create fails as it walks - * down from root. + * Returns pointer to the looked up or created blkg on success, ERR_PTR() + * value on error. If @q is dead, returns ERR_PTR(-EINVAL). If @q is not + * dead and bypassing, returns ERR_PTR(-EBUSY). */ -struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q) +struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q) { struct blkcg_gq *blkg; @@ -321,7 +285,7 @@ struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, * we shouldn't allow anything to go through for a bypassing queue. */ if (unlikely(blk_queue_bypass(q))) - return q->root_blkg; + return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY); blkg = __blkg_lookup(blkcg, q, true); if (blkg) @@ -329,58 +293,23 @@ struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, /* * Create blkgs walking down from blkcg_root to @blkcg, so that all - * non-root blkgs have access to their parents. Returns the closest - * blkg to the intended blkg should blkg_create() fail. + * non-root blkgs have access to their parents. */ while (true) { struct blkcg *pos = blkcg; struct blkcg *parent = blkcg_parent(blkcg); - struct blkcg_gq *ret_blkg = q->root_blkg; - - while (parent) { - blkg = __blkg_lookup(parent, q, false); - if (blkg) { - /* remember closest blkg */ - ret_blkg = blkg; - break; - } + + while (parent && !__blkg_lookup(parent, q, false)) { pos = parent; parent = blkcg_parent(parent); } blkg = blkg_create(pos, q, NULL); - if (IS_ERR(blkg)) - return ret_blkg; - if (pos == blkcg) + if (pos == blkcg || IS_ERR(blkg)) return blkg; } } -/** - * blkg_lookup_create - find or create a blkg - * @blkcg: target block cgroup - * @q: target request_queue - * - * This looks up or creates the blkg representing the unique pair - * of the blkcg and the request_queue. - */ -struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q) -{ - struct blkcg_gq *blkg = blkg_lookup(blkcg, q); - unsigned long flags; - - if (unlikely(!blkg)) { - spin_lock_irqsave(q->queue_lock, flags); - - blkg = __blkg_lookup_create(blkcg, q); - - spin_unlock_irqrestore(q->queue_lock, flags); - } - - return blkg; -} - static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; @@ -424,7 +353,7 @@ static void blkg_destroy(struct blkcg_gq *blkg) * Put the reference taken at the time of creation so that when all * queues are gone, group can be destroyed. */ - percpu_ref_kill(&blkg->refcnt); + blkg_put(blkg); } /** @@ -451,6 +380,29 @@ static void blkg_destroy_all(struct request_queue *q) q->root_rl.blkg = NULL; } +/* + * A group is RCU protected, but having an rcu lock does not mean that one + * can access all the fields of blkg and assume these are valid. For + * example, don't try to follow throtl_data and request queue links. + * + * Having a reference to blkg under an rcu allows accesses to only values + * local to groups like group stats and group rate limits. + */ +void __blkg_release_rcu(struct rcu_head *rcu_head) +{ + struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head); + + /* release the blkcg and parent blkg refs this blkg has been holding */ + css_put(&blkg->blkcg->css); + if (blkg->parent) + blkg_put(blkg->parent); + + wb_congested_put(blkg->wb_congested); + + blkg_free(blkg); +} +EXPORT_SYMBOL_GPL(__blkg_release_rcu); + /* * The next function used by blk_queue_for_each_rl(). It's a bit tricky * because the root blkg uses @q->root_rl instead of its own rl. @@ -1796,7 +1748,8 @@ void blkcg_maybe_throttle_current(void) blkg = blkg_lookup(blkcg, q); if (!blkg) goto out; - if (!blkg_tryget(blkg)) + blkg = blkg_try_get(blkg); + if (!blkg) goto out; rcu_read_unlock(); diff --git a/block/blk-core.c b/block/blk-core.c index 26a5dac80ed9..ce12515f9b9b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2435,7 +2435,6 @@ blk_qc_t generic_make_request(struct bio *bio) if (q) blk_queue_exit(q); q = bio->bi_disk->queue; - bio_reassociate_blkg(q, bio); flags = 0; if (bio->bi_opf & REQ_NOWAIT) flags = BLK_MQ_REQ_NOWAIT; diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 35c48d7b8f78..bb240a0c1309 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -480,12 +480,34 @@ static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock) { struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); - struct blkcg_gq *blkg = bio->bi_blkg; + struct blkcg *blkcg; + struct blkcg_gq *blkg; + struct request_queue *q = rqos->q; bool issue_as_root = bio_issue_as_root_blkg(bio); if (!blk_iolatency_enabled(blkiolat)) return; + rcu_read_lock(); + blkcg = bio_blkcg(bio); + bio_associate_blkcg(bio, &blkcg->css); + blkg = blkg_lookup(blkcg, q); + if (unlikely(!blkg)) { + if (!lock) + spin_lock_irq(q->queue_lock); + blkg = blkg_lookup_create(blkcg, q); + if (IS_ERR(blkg)) + blkg = NULL; + if (!lock) + spin_unlock_irq(q->queue_lock); + } + if (!blkg) + goto out; + + bio_issue_init(&bio->bi_issue, bio_sectors(bio)); + bio_associate_blkg(bio, blkg); +out: + rcu_read_unlock(); while (blkg && blkg->parent) { struct iolatency_grp *iolat = blkg_to_lat(blkg); if (!iolat) { @@ -706,7 +728,7 @@ static void blkiolatency_timer_fn(struct timer_list *t) * We could be exiting, don't access the pd unless we have a * ref on the blkg. */ - if (!blkg_tryget(blkg)) + if (!blkg_try_get(blkg)) continue; iolat = blkg_to_lat(blkg); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 4bda70e8db48..db1a3a2ae006 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2115,11 +2115,21 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) } #endif +static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio) +{ +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW + /* fallback to root_blkg if we fail to get a blkg ref */ + if (bio->bi_css && (bio_associate_blkg(bio, tg_to_blkg(tg)) == -ENODEV)) + bio_associate_blkg(bio, bio->bi_disk->queue->root_blkg); + bio_issue_init(&bio->bi_issue, bio_sectors(bio)); +#endif +} + bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, struct bio *bio) { struct throtl_qnode *qn = NULL; - struct throtl_grp *tg = blkg_to_tg(blkg); + struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg); struct throtl_service_queue *sq; bool rw = bio_data_dir(bio); bool throttled = false; @@ -2138,6 +2148,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, if (unlikely(blk_queue_bypass(q))) goto out_unlock; + blk_throtl_assoc_bio(tg, bio); blk_throtl_update_idletime(tg); sq = &tg->service_queue; diff --git a/block/bounce.c b/block/bounce.c index ec0d99995f5f..418677dcec60 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -276,9 +276,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, } } - bio_clone_blkg_association(bio, bio_src); - - blkcg_bio_issue_init(bio); + bio_clone_blkcg_association(bio, bio_src); return bio; } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 6a3d87dd3c1a..ed41aa978c4a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3759,7 +3759,7 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) uint64_t serial_nr; rcu_read_lock(); - serial_nr = __bio_blkcg(bio)->css.serial_nr; + serial_nr = bio_blkcg(bio)->css.serial_nr; rcu_read_unlock(); /* @@ -3824,7 +3824,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, struct cfq_group *cfqg; rcu_read_lock(); - cfqg = cfq_lookup_cfqg(cfqd, __bio_blkcg(bio)); + cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio)); if (!cfqg) { cfqq = &cfqd->oom_cfqq; goto out; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index abad6d15f956..ea9debf59b22 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -77,7 +77,6 @@ #include #include #include -#include #include "loop.h" @@ -1761,8 +1760,8 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, /* always use the first bio's css */ #ifdef CONFIG_BLK_CGROUP - if (cmd->use_aio && rq->bio && rq->bio->bi_blkg) { - cmd->css = &bio_blkcg(rq->bio)->css; + if (cmd->use_aio && rq->bio && rq->bio->bi_css) { + cmd->css = rq->bio->bi_css; css_get(cmd->css); } else #endif diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index f3fb5bb8c82a..ac1cffd2a09b 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -542,7 +542,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) !discard_bio) continue; bio_chain(discard_bio, bio); - bio_clone_blkg_association(discard_bio, bio); + bio_clone_blkcg_association(discard_bio, bio); if (mddev->gendisk) trace_block_bio_remap(bdev_get_queue(rdev->bdev), discard_bio, disk_devt(mddev->gendisk), diff --git a/fs/buffer.c b/fs/buffer.c index 109f55196866..6f1ae3ac9789 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3060,6 +3060,11 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, */ bio = bio_alloc(GFP_NOIO, 1); + if (wbc) { + wbc_init_bio(wbc, bio); + wbc_account_io(wbc, bh->b_page, bh->b_size); + } + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio_set_dev(bio, bh->b_bdev); bio->bi_write_hint = write_hint; @@ -3079,11 +3084,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, op_flags |= REQ_PRIO; bio_set_op_attrs(bio, op, op_flags); - if (wbc) { - wbc_init_bio(wbc, bio); - wbc_account_io(wbc, bh->b_page, bh->b_size); - } - submit_bio(bio); return 0; } diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 2aa62d58d8dd..db7590178dfc 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -374,13 +374,13 @@ static int io_submit_init_bio(struct ext4_io_submit *io, bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); if (!bio) return -ENOMEM; + wbc_init_bio(io->io_wbc, bio); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio_set_dev(bio, bh->b_bdev); bio->bi_end_io = ext4_end_bio; bio->bi_private = ext4_get_io_end(io->io_end); io->io_bio = bio; io->io_next_block = bh->b_blocknr; - wbc_init_bio(io->io_wbc, bio); return 0; } diff --git a/include/linux/bio.h b/include/linux/bio.h index b47c7f716731..056fb627edb3 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -503,31 +503,23 @@ do { \ disk_devt((bio)->bi_disk) #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -int bio_associate_blkg_from_page(struct bio *bio, struct page *page); +int bio_associate_blkcg_from_page(struct bio *bio, struct page *page); #else -static inline int bio_associate_blkg_from_page(struct bio *bio, - struct page *page) { return 0; } +static inline int bio_associate_blkcg_from_page(struct bio *bio, + struct page *page) { return 0; } #endif #ifdef CONFIG_BLK_CGROUP +int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg); -int bio_associate_blkg_from_css(struct bio *bio, - struct cgroup_subsys_state *css); -int bio_associate_create_blkg(struct request_queue *q, struct bio *bio); -int bio_reassociate_blkg(struct request_queue *q, struct bio *bio); void bio_disassociate_task(struct bio *bio); -void bio_clone_blkg_association(struct bio *dst, struct bio *src); +void bio_clone_blkcg_association(struct bio *dst, struct bio *src); #else /* CONFIG_BLK_CGROUP */ -static inline int bio_associate_blkg_from_css(struct bio *bio, - struct cgroup_subsys_state *css) -{ return 0; } -static inline int bio_associate_create_blkg(struct request_queue *q, - struct bio *bio) { return 0; } -static inline int bio_reassociate_blkg(struct request_queue *q, struct bio *bio) -{ return 0; } +static inline int bio_associate_blkcg(struct bio *bio, + struct cgroup_subsys_state *blkcg_css) { return 0; } static inline void bio_disassociate_task(struct bio *bio) { } -static inline void bio_clone_blkg_association(struct bio *dst, - struct bio *src) { } +static inline void bio_clone_blkcg_association(struct bio *dst, + struct bio *src) { } #endif /* CONFIG_BLK_CGROUP */ #ifdef CONFIG_HIGHMEM diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 1e76ceebeb5d..6d766a19f2bb 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -126,7 +126,7 @@ struct blkcg_gq { struct request_list rl; /* reference count */ - struct percpu_ref refcnt; + atomic_t refcnt; /* is this blkg online? protected by both blkcg and q locks */ bool online; @@ -184,8 +184,6 @@ extern struct cgroup_subsys_state * const blkcg_root_css; struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, struct request_queue *q, bool update_hint); -struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q); struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct request_queue *q); int blkcg_init_queue(struct request_queue *q); @@ -232,59 +230,22 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, char *input, struct blkg_conf_ctx *ctx); void blkg_conf_finish(struct blkg_conf_ctx *ctx); -/** - * blkcg_css - find the current css - * - * Find the css associated with either the kthread or the current task. - * This may return a dying css, so it is up to the caller to use tryget logic - * to confirm it is alive and well. - */ -static inline struct cgroup_subsys_state *blkcg_css(void) -{ - struct cgroup_subsys_state *css; - - css = kthread_blkcg(); - if (css) - return css; - return task_css(current, io_cgrp_id); -} static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) { return css ? container_of(css, struct blkcg, css) : NULL; } -/** - * __bio_blkcg - internal version of bio_blkcg for bfq and cfq - * - * DO NOT USE. - * There is a flaw using this version of the function. In particular, this was - * used in a broken paradigm where association was called on the given css. It - * is possible though that the returned css from task_css() is in the process - * of dying due to migration of the current task. So it is improper to assume - * *_get() is going to succeed. Both BFQ and CFQ rely on this logic and will - * take additional work to handle more gracefully. - */ -static inline struct blkcg *__bio_blkcg(struct bio *bio) -{ - if (bio && bio->bi_blkg) - return bio->bi_blkg->blkcg; - return css_to_blkcg(blkcg_css()); -} - -/** - * bio_blkcg - grab the blkcg associated with a bio - * @bio: target bio - * - * This returns the blkcg associated with a bio, NULL if not associated. - * Callers are expected to either handle NULL or know association has been - * done prior to calling this. - */ static inline struct blkcg *bio_blkcg(struct bio *bio) { - if (bio && bio->bi_blkg) - return bio->bi_blkg->blkcg; - return NULL; + struct cgroup_subsys_state *css; + + if (bio && bio->bi_css) + return css_to_blkcg(bio->bi_css); + css = kthread_blkcg(); + if (css) + return css_to_blkcg(css); + return css_to_blkcg(task_css(current, io_cgrp_id)); } static inline bool blk_cgroup_congested(void) @@ -490,35 +451,26 @@ static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) */ static inline void blkg_get(struct blkcg_gq *blkg) { - percpu_ref_get(&blkg->refcnt); + WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); + atomic_inc(&blkg->refcnt); } /** - * blkg_tryget - try and get a blkg reference + * blkg_try_get - try and get a blkg reference * @blkg: blkg to get * * This is for use when doing an RCU lookup of the blkg. We may be in the midst * of freeing this blkg, so we can only use it if the refcnt is not zero. */ -static inline bool blkg_tryget(struct blkcg_gq *blkg) +static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg) { - return percpu_ref_tryget(&blkg->refcnt); + if (atomic_inc_not_zero(&blkg->refcnt)) + return blkg; + return NULL; } -/** - * blkg_tryget_closest - try and get a blkg ref on the closet blkg - * @blkg: blkg to get - * - * This walks up the blkg tree to find the closest non-dying blkg and returns - * the blkg that it did association with as it may not be the passed in blkg. - */ -static inline struct blkcg_gq *blkg_tryget_closest(struct blkcg_gq *blkg) -{ - while (!percpu_ref_tryget(&blkg->refcnt)) - blkg = blkg->parent; - return blkg; -} +void __blkg_release_rcu(struct rcu_head *rcu); /** * blkg_put - put a blkg reference @@ -526,7 +478,9 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct blkcg_gq *blkg) */ static inline void blkg_put(struct blkcg_gq *blkg) { - percpu_ref_put(&blkg->refcnt); + WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); + if (atomic_dec_and_test(&blkg->refcnt)) + call_rcu(&blkg->rcu_head, __blkg_release_rcu); } /** @@ -579,36 +533,25 @@ static inline struct request_list *blk_get_rl(struct request_queue *q, rcu_read_lock(); - if (bio && bio->bi_blkg) { - blkcg = bio->bi_blkg->blkcg; - if (blkcg == &blkcg_root) - goto rl_use_root; - - blkg_get(bio->bi_blkg); - rcu_read_unlock(); - return &bio->bi_blkg->rl; - } + blkcg = bio_blkcg(bio); - blkcg = css_to_blkcg(blkcg_css()); + /* bypass blkg lookup and use @q->root_rl directly for root */ if (blkcg == &blkcg_root) - goto rl_use_root; + goto root_rl; + /* + * Try to use blkg->rl. blkg lookup may fail under memory pressure + * or if either the blkcg or queue is going away. Fall back to + * root_rl in such cases. + */ blkg = blkg_lookup(blkcg, q); if (unlikely(!blkg)) - blkg = __blkg_lookup_create(blkcg, q); - - if (blkg->blkcg == &blkcg_root || !blkg_tryget(blkg)) - goto rl_use_root; + goto root_rl; + blkg_get(blkg); rcu_read_unlock(); return &blkg->rl; - - /* - * Each blkg has its own request_list, however, the root blkcg - * uses the request_queue's root_rl. This is to avoid most - * overhead for the root blkcg. - */ -rl_use_root: +root_rl: rcu_read_unlock(); return &q->root_rl; } @@ -854,26 +797,32 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg struct bio *bio) { return false; } #endif - -static inline void blkcg_bio_issue_init(struct bio *bio) -{ - bio_issue_init(&bio->bi_issue, bio_sectors(bio)); -} - static inline bool blkcg_bio_issue_check(struct request_queue *q, struct bio *bio) { + struct blkcg *blkcg; struct blkcg_gq *blkg; bool throtl = false; rcu_read_lock(); + blkcg = bio_blkcg(bio); + + /* associate blkcg if bio hasn't attached one */ + bio_associate_blkcg(bio, &blkcg->css); - bio_associate_create_blkg(q, bio); - blkg = bio->bi_blkg; + blkg = blkg_lookup(blkcg, q); + if (unlikely(!blkg)) { + spin_lock_irq(q->queue_lock); + blkg = blkg_lookup_create(blkcg, q); + if (IS_ERR(blkg)) + blkg = NULL; + spin_unlock_irq(q->queue_lock); + } throtl = blk_throtl_bio(q, blkg, bio); if (!throtl) { + blkg = blkg ?: q->root_blkg; /* * If the bio is flagged with BIO_QUEUE_ENTERED it means this * is a split bio and we would have already accounted for the @@ -885,8 +834,6 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1); } - blkcg_bio_issue_init(bio); - rcu_read_unlock(); return !throtl; } @@ -983,7 +930,6 @@ static inline int blkcg_activate_policy(struct request_queue *q, static inline void blkcg_deactivate_policy(struct request_queue *q, const struct blkcg_policy *pol) { } -static inline struct blkcg *__bio_blkcg(struct bio *bio) { return NULL; } static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, @@ -999,7 +945,6 @@ static inline void blk_put_rl(struct request_list *rl) { } static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } -static inline void blkcg_bio_issue_init(struct bio *bio) { } static inline bool blkcg_bio_issue_check(struct request_queue *q, struct bio *bio) { return true; } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 093a818c5b68..1dcf652ba0aa 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -178,6 +178,7 @@ struct bio { * release. Read comment on top of bio_associate_current(). */ struct io_context *bi_ioc; + struct cgroup_subsys_state *bi_css; struct blkcg_gq *bi_blkg; struct bio_issue bi_issue; #endif diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b8bcbdeb2eac..32c553556bbd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -93,8 +93,6 @@ extern struct css_set init_css_set; bool css_has_online_children(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); -struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup, - struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, struct cgroup_subsys *ss); struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 738a0c24874f..fdfd04e348f6 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -246,8 +246,7 @@ static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, * * @bio is a part of the writeback in progress controlled by @wbc. Perform * writeback specific initialization. This is used to apply the cgroup - * writeback context. Must be called after the bio has been associated with - * a device. + * writeback context. */ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) { @@ -258,7 +257,7 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) * regular writeback instead of writing things out itself. */ if (wbc->wb) - bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css); + bio_associate_blkcg(bio, wbc->wb->blkcg_css); } #else /* CONFIG_CGROUP_WRITEBACK */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4c1cf0969a80..4a3dae2a8283 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -492,7 +492,7 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, } /** - * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss + * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) * @@ -501,8 +501,8 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, * enabled. If @ss is associated with the hierarchy @cgrp is on, this * function is guaranteed to return non-NULL css. */ -static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp, - struct cgroup_subsys *ss) +static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) { lockdep_assert_held(&cgroup_mutex); @@ -522,35 +522,6 @@ static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp, return cgroup_css(cgrp, ss); } -/** - * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem - * @cgrp: the cgroup of interest - * @ss: the subsystem of interest - * - * Find and get the effective css of @cgrp for @ss. The effective css is - * defined as the matching css of the nearest ancestor including self which - * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, - * the root css is returned, so this function always returns a valid css. - * - * The returned css is not guaranteed to be online, and therefore it is the - * callers responsiblity to tryget a reference for it. - */ -struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, - struct cgroup_subsys *ss) -{ - struct cgroup_subsys_state *css; - - do { - css = cgroup_css(cgrp, ss); - - if (css) - return css; - cgrp = cgroup_parent(cgrp); - } while (cgrp); - - return init_css_set.subsys[ss->id]; -} - /** * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest @@ -633,11 +604,10 @@ EXPORT_SYMBOL_GPL(of_css); * * Should be called under cgroup_[tree_]mutex. */ -#define for_each_e_css(css, ssid, cgrp) \ - for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ - if (!((css) = cgroup_e_css_by_mask(cgrp, \ - cgroup_subsys[(ssid)]))) \ - ; \ +#define for_each_e_css(css, ssid, cgrp) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \ + ; \ else /** @@ -1036,7 +1006,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, * @ss is in this hierarchy, so we want the * effective css from @cgrp. */ - template[i] = cgroup_e_css_by_mask(cgrp, ss); + template[i] = cgroup_e_css(cgrp, ss); } else { /* * @ss is not in this hierarchy, so we don't want @@ -3053,7 +3023,7 @@ static int cgroup_apply_control(struct cgroup *cgrp) return ret; /* - * At this point, cgroup_e_css_by_mask() results reflect the new csses + * At this point, cgroup_e_css() results reflect the new csses * making the following cgroup_update_dfl_csses() properly update * css associations of all tasks in the subtree. */ diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index fac0ddf8a8e2..2868d85f1fb1 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -764,9 +764,9 @@ blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) return NULL; - if (!bio->bi_blkg) + if (!bio->bi_css) return NULL; - return cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup); + return cgroup_get_kernfs_id(bio->bi_css->cgroup); } #else static union kernfs_node_id * diff --git a/mm/page_io.c b/mm/page_io.c index 573d3663d846..aafd19ec1db4 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -339,7 +339,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, goto out; } bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc); - bio_associate_blkg_from_page(bio, page); + bio_associate_blkcg_from_page(bio, page); count_swpout_vm_event(page); set_page_writeback(page); unlock_page(page); -- cgit v1.2.3 From df0734702a7cbba49d6765bd5ba069340bf9c5db Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 2 Nov 2018 10:16:15 -0700 Subject: bpf: show real jited prog address in /proc/kallsyms Currently, /proc/kallsyms shows page address of jited bpf program. The main reason here is to not expose randomized start address. However, This is not ideal for detailed profiling (find hot instructions from stack traces). This patch replaces the page address with real prog start address. This change is OK because these addresses are still protected by sysctl kptr_restrict (see kallsyms_show_value()), and only programs loaded by root are added to kallsyms (see bpf_prog_kallsyms_add()). Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6377225b2082..1a796e0799ec 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -553,7 +553,6 @@ bool is_bpf_text_address(unsigned long addr) int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym) { - unsigned long symbol_start, symbol_end; struct bpf_prog_aux *aux; unsigned int it = 0; int ret = -ERANGE; @@ -566,10 +565,9 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, if (it++ != symnum) continue; - bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); bpf_get_prog_name(aux->prog, sym); - *value = symbol_start; + *value = (unsigned long)aux->prog->bpf_func; *type = BPF_SYM_ELF_TYPE; ret = 0; -- cgit v1.2.3 From de57e99ceb65d0d7775cc14a8ba5931d7de1d708 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 2 Nov 2018 10:16:16 -0700 Subject: bpf: show real jited address in bpf_prog_info->jited_ksyms Currently, jited_ksyms in bpf_prog_info shows page addresses of jited bpf program. The main reason here is to not expose randomized start address. However, this is not ideal for detailed profiling (find hot instructions from stack traces). This patch replaces the page address with real prog start address. This change is OK because bpf_prog_get_info_by_fd() is only available to root. Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ccb93277aae2..34a9eef5992c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2172,7 +2172,6 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, user_ksyms = u64_to_user_ptr(info.jited_ksyms); for (i = 0; i < ulen; i++) { ksym_addr = (ulong) prog->aux->func[i]->bpf_func; - ksym_addr &= PAGE_MASK; if (put_user((u64) ksym_addr, &user_ksyms[i])) return -EFAULT; } -- cgit v1.2.3 From ff1889fc531f582f902175c0acc80321af540b24 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 2 Nov 2018 10:16:17 -0700 Subject: bpf: show main program address and length in bpf_prog_info Currently, when there is no subprog (prog->aux->func_cnt == 0), bpf_prog_info does not return any jited_ksyms or jited_func_lens. This patch adds main program address (prog->bpf_func) and main program length (prog->jited_len) to bpf_prog_info. Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 34a9eef5992c..9418174c276c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2158,11 +2158,11 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } ulen = info.nr_jited_ksyms; - info.nr_jited_ksyms = prog->aux->func_cnt; + info.nr_jited_ksyms = prog->aux->func_cnt ? : 1; if (info.nr_jited_ksyms && ulen) { if (bpf_dump_raw_ok()) { + unsigned long ksym_addr; u64 __user *user_ksyms; - ulong ksym_addr; u32 i; /* copy the address of the kernel symbol @@ -2170,9 +2170,17 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, */ ulen = min_t(u32, info.nr_jited_ksyms, ulen); user_ksyms = u64_to_user_ptr(info.jited_ksyms); - for (i = 0; i < ulen; i++) { - ksym_addr = (ulong) prog->aux->func[i]->bpf_func; - if (put_user((u64) ksym_addr, &user_ksyms[i])) + if (prog->aux->func_cnt) { + for (i = 0; i < ulen; i++) { + ksym_addr = (unsigned long) + prog->aux->func[i]->bpf_func; + if (put_user((u64) ksym_addr, + &user_ksyms[i])) + return -EFAULT; + } + } else { + ksym_addr = (unsigned long) prog->bpf_func; + if (put_user((u64) ksym_addr, &user_ksyms[0])) return -EFAULT; } } else { @@ -2181,7 +2189,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } ulen = info.nr_jited_func_lens; - info.nr_jited_func_lens = prog->aux->func_cnt; + info.nr_jited_func_lens = prog->aux->func_cnt ? : 1; if (info.nr_jited_func_lens && ulen) { if (bpf_dump_raw_ok()) { u32 __user *user_lens; @@ -2190,9 +2198,16 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, /* copy the JITed image lengths for each function */ ulen = min_t(u32, info.nr_jited_func_lens, ulen); user_lens = u64_to_user_ptr(info.jited_func_lens); - for (i = 0; i < ulen; i++) { - func_len = prog->aux->func[i]->jited_len; - if (put_user(func_len, &user_lens[i])) + if (prog->aux->func_cnt) { + for (i = 0; i < ulen; i++) { + func_len = + prog->aux->func[i]->jited_len; + if (put_user(func_len, &user_lens[i])) + return -EFAULT; + } + } else { + func_len = prog->jited_len; + if (put_user(func_len, &user_lens[0])) return -EFAULT; } } else { -- cgit v1.2.3 From 28c2fae726bf5003cd209b0d5910a642af98316f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 2 Nov 2018 11:35:46 +0100 Subject: bpf: fix bpf_prog_get_info_by_fd to return 0 func_lens for unpriv While dbecd7388476 ("bpf: get kernel symbol addresses via syscall") zeroed info.nr_jited_ksyms in bpf_prog_get_info_by_fd() for queries from unprivileged users, commit 815581c11cc2 ("bpf: get JITed image lengths of functions via syscall") forgot about doing so and therefore returns the #elems of the user set up buffer which is incorrect. It also needs to indicate a info.nr_jited_func_lens of zero. Fixes: 815581c11cc2 ("bpf: get JITed image lengths of functions via syscall") Signed-off-by: Daniel Borkmann Cc: Sandipan Das Cc: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9418174c276c..cf5040fd5434 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2078,6 +2078,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.jited_prog_len = 0; info.xlated_prog_len = 0; info.nr_jited_ksyms = 0; + info.nr_jited_func_lens = 0; goto done; } -- cgit v1.2.3 From 3383b36040522505546cb112f0a543a5998edfb6 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Fri, 2 Nov 2018 15:48:35 -0700 Subject: kernel/kexec_file.c: remove some duplicated includes We include kexec.h and slab.h twice in kexec_file.c. It's unnecessary. hence just remove them. Link: http://lkml.kernel.org/r/1537498098-19171-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Reviewed-by: Bhupesh Sharma Reviewed-by: Andrew Morton Acked-by: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index c6a3b6851372..35cf0ad29718 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -25,8 +25,6 @@ #include #include #include -#include -#include #include #include #include "kexec_internal.h" -- cgit v1.2.3 From 6f0483d1f91b612186abeaebf3ce43bf805eb9f7 Mon Sep 17 00:00:00 2001 From: Michael Schupikov Date: Fri, 2 Nov 2018 15:48:38 -0700 Subject: kernel/sysctl.c: remove duplicated include Remove one include of . No functional changes. Link: http://lkml.kernel.org/r/20181004134223.17735-1-michael@schupikov.de Signed-off-by: Michael Schupikov Reviewed-by: Richard Weinberger Acked-by: Luis Chamberlain Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3ae223f7b5df..5fc724e4e454 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -66,7 +66,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3 From f75d651587f719a813ebbbfeee570e6570731d55 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 4 Nov 2018 18:40:14 -0800 Subject: resource/docs: Fix new kernel-doc warnings The first group of warnings is caused by a "/**" kernel-doc notation marker but the function comments are not in kernel-doc format. Also add another error return value here. ../kernel/resource.c:337: warning: Function parameter or member 'start' not described in 'find_next_iomem_res' ../kernel/resource.c:337: warning: Function parameter or member 'end' not described in 'find_next_iomem_res' ../kernel/resource.c:337: warning: Function parameter or member 'flags' not described in 'find_next_iomem_res' ../kernel/resource.c:337: warning: Function parameter or member 'desc' not described in 'find_next_iomem_res' ../kernel/resource.c:337: warning: Function parameter or member 'first_lvl' not described in 'find_next_iomem_res' ../kernel/resource.c:337: warning: Function parameter or member 'res' not described in 'find_next_iomem_res' Add the missing function parameter documentation for the other warnings: ../kernel/resource.c:409: warning: Function parameter or member 'arg' not described in 'walk_iomem_res_desc' ../kernel/resource.c:409: warning: Function parameter or member 'func' not described in 'walk_iomem_res_desc' Signed-off-by: Randy Dunlap Cc: Andrew Morton Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: b69c2e20f6e4 ("resource: Clean it up a bit") Link: http://lkml.kernel.org/r/dda2e4d8-bedd-3167-20fe-8c7d2d35b354@infradead.org Signed-off-by: Ingo Molnar --- kernel/resource.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index b3a3a1fc499e..17bcb189d530 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -318,14 +318,14 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); -/** +/* * Finds the lowest iomem resource that covers part of [start..end]. The * caller must specify start, end, flags, and desc (which may be * IORES_DESC_NONE). * * If a resource is found, returns 0 and *res is overwritten with the part * of the resource that's within [start..end]; if none is found, returns - * -1. + * -1. Returns -EINVAL for other invalid parameters. * * This function walks the whole tree and not just first level children * unless @first_lvl is true. @@ -390,7 +390,9 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, } /** - * Walks through iomem resources and calls func() with matching resource + * walk_iomem_res_desc - walk through iomem resources + * + * Walks through iomem resources and calls @func() with matching resource * ranges. This walks through whole tree and not just first level children. * All the memory ranges which overlap start,end and also match flags and * desc are valid candidates. @@ -399,6 +401,8 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, * @flags: I/O resource flags * @start: start addr * @end: end addr + * @arg: function argument for the callback @func + * @func: callback function that is called for each qualifying resource area * * NOTE: For a new descriptor search, define a new IORES_DESC in * and set it in 'desc' of a target resource entry. -- cgit v1.2.3 From ee474b81fe5aa5dc0faae920bf66240fbf55f891 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 1 Nov 2018 23:29:28 +0900 Subject: tracing/kprobes: Fix strpbrk() argument order Fix strpbrk()'s argument order, it must pass acceptable string in 2nd argument. Note that this can cause a kernel panic where it recovers backup character to code->data. Link: http://lkml.kernel.org/r/154108256792.2604.1816052586385217811.stgit@devbox Fixes: a6682814f371 ("tracing/kprobes: Allow kprobe-events to record module symbol") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 3ef15a6683c0..bd30e9398d2a 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -535,7 +535,7 @@ int traceprobe_update_arg(struct probe_arg *arg) if (code[1].op != FETCH_OP_IMM) return -EINVAL; - tmp = strpbrk("+-", code->data); + tmp = strpbrk(code->data, "+-"); if (tmp) c = *tmp; ret = traceprobe_split_symbol_offset(code->data, -- cgit v1.2.3 From f26621e60b35369bca9228bc936dc723b3e421af Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 5 Nov 2018 10:33:07 +0100 Subject: resource/docs: Complete kernel-doc style function documentation Add the missing kernel-doc style function parameters documentation. Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: linux-tip-commits@vger.kernel.org Cc: rdunlap@infradead.org Fixes: b69c2e20f6e4 ("resource: Clean it up a bit") Link: http://lkml.kernel.org/r/20181105093307.GA12445@zn.tnic Signed-off-by: Ingo Molnar --- kernel/resource.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 17bcb189d530..b0fbf685c77a 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -318,17 +318,24 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); -/* - * Finds the lowest iomem resource that covers part of [start..end]. The - * caller must specify start, end, flags, and desc (which may be +/** + * Finds the lowest iomem resource that covers part of [@start..@end]. The + * caller must specify @start, @end, @flags, and @desc (which may be * IORES_DESC_NONE). * - * If a resource is found, returns 0 and *res is overwritten with the part - * of the resource that's within [start..end]; if none is found, returns - * -1. Returns -EINVAL for other invalid parameters. + * If a resource is found, returns 0 and @*res is overwritten with the part + * of the resource that's within [@start..@end]; if none is found, returns + * -1 or -EINVAL for other invalid parameters. * * This function walks the whole tree and not just first level children * unless @first_lvl is true. + * + * @start: start address of the resource searched for + * @end: end address of same resource + * @flags: flags which the resource must have + * @desc: descriptor the resource must have + * @first_lvl: walk only the first level children, if set + * @res: return ptr, if resource found */ static int find_next_iomem_res(resource_size_t start, resource_size_t end, unsigned long flags, unsigned long desc, @@ -390,9 +397,7 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, } /** - * walk_iomem_res_desc - walk through iomem resources - * - * Walks through iomem resources and calls @func() with matching resource + * Walks through iomem resources and calls func() with matching resource * ranges. This walks through whole tree and not just first level children. * All the memory ranges which overlap start,end and also match flags and * desc are valid candidates. -- cgit v1.2.3 From d2f007dbe7e4c9583eea6eb04d60001e85c6f1bd Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 5 Nov 2018 20:55:09 +0100 Subject: userns: also map extents in the reverse map to kernel IDs The current logic first clones the extent array and sorts both copies, then maps the lower IDs of the forward mapping into the lower namespace, but doesn't map the lower IDs of the reverse mapping. This means that code in a nested user namespace with >5 extents will see incorrect IDs. It also breaks some access checks, like inode_owner_or_capable() and privileged_wrt_inode_uidgid(), so a process can incorrectly appear to be capable relative to an inode. To fix it, we have to make sure that the "lower_first" members of extents in both arrays are translated; and we have to make sure that the reverse map is sorted *after* the translation (since otherwise the translation can break the sorting). This is CVE-2018-18955. Fixes: 6397fac4915a ("userns: bump idmap limits to 340") Cc: stable@vger.kernel.org Signed-off-by: Jann Horn Tested-by: Eric W. Biederman Reviewed-by: Eric W. Biederman Signed-off-by: Eric W. Biederman --- kernel/user_namespace.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index e5222b5fb4fe..923414a246e9 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -974,10 +974,6 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (!new_idmap_permitted(file, ns, cap_setid, &new_map)) goto out; - ret = sort_idmaps(&new_map); - if (ret < 0) - goto out; - ret = -EPERM; /* Map the lower ids from the parent user namespace to the * kernel global id space. @@ -1004,6 +1000,14 @@ static ssize_t map_write(struct file *file, const char __user *buf, e->lower_first = lower_first; } + /* + * If we want to use binary search for lookup, this clones the extent + * array and sorts both copies. + */ + ret = sort_idmaps(&new_map); + if (ret < 0) + goto out; + /* Install the map */ if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { memcpy(map->extent, new_map.extent, -- cgit v1.2.3 From e6a2d72c10405b30ddba5af2e44a9d3d925a56d3 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 7 Nov 2018 12:10:32 +0100 Subject: posix-cpu-timers: Remove useless call to check_dl_overrun() check_dl_overrun() is used to send a SIGXCPU to users that asked to be informed when a SCHED_DEADLINE runtime overruns occur. The function is called by check_thread_timers() already, so the call in check_process_timers() is redundant/wrong (even though harmless). Remove it. Fixes: 34be39305a77 ("sched/deadline: Implement "runtime overrun signal" support") Signed-off-by: Juri Lelli Signed-off-by: Thomas Gleixner Reviewed-by: Daniel Bristot de Oliveira Reviewed-by: Steven Rostedt (VMware) Cc: linux-rt-users@vger.kernel.org Cc: mtk.manpages@gmail.com Cc: Mathieu Poirier Cc: Peter Zijlstra Cc: Luca Abeni Cc: Claudio Scordino Link: https://lkml.kernel.org/r/20181107111032.32291-1-juri.lelli@redhat.com --- kernel/time/posix-cpu-timers.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index ce32cf741b25..8f0644af40be 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -917,9 +917,6 @@ static void check_process_timers(struct task_struct *tsk, struct task_cputime cputime; unsigned long soft; - if (dl_task(tsk)) - check_dl_overrun(tsk); - /* * If cputimer is not running, then there are no active * process wide timers (POSIX 1.b, itimers, RLIMIT_CPU). -- cgit v1.2.3 From dded2e159208a9edc21dd5c5f583afa28d378d39 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 27 Sep 2018 17:17:49 +0000 Subject: kdb: use correct pointer when 'btc' calls 'btt' On a powerpc 8xx, 'btc' fails as follows: Entering kdb (current=0x(ptrval), pid 282) due to Keyboard Entry kdb> btc btc: cpu status: Currently on cpu 0 Available cpus: 0 kdb_getarea: Bad address 0x0 when booting the kernel with 'debug_boot_weak_hash', it fails as well Entering kdb (current=0xba99ad80, pid 284) due to Keyboard Entry kdb> btc btc: cpu status: Currently on cpu 0 Available cpus: 0 kdb_getarea: Bad address 0xba99ad80 On other platforms, Oopses have been observed too, see https://github.com/linuxppc/linux/issues/139 This is due to btc calling 'btt' with %p pointer as an argument. This patch replaces %p by %px to get the real pointer value as expected by 'btt' Fixes: ad67b74d2469 ("printk: hash addresses printed with %p") Cc: Signed-off-by: Christophe Leroy Reviewed-by: Daniel Thompson Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_bt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 6ad4a9fcbd6f..7921ae4fca8d 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -179,14 +179,14 @@ kdb_bt(int argc, const char **argv) kdb_printf("no process for cpu %ld\n", cpu); return 0; } - sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu)); + sprintf(buf, "btt 0x%px\n", KDB_TSK(cpu)); kdb_parse(buf); return 0; } kdb_printf("btc: cpu status: "); kdb_parse("cpu\n"); for_each_online_cpu(cpu) { - sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu)); + sprintf(buf, "btt 0x%px\n", KDB_TSK(cpu)); kdb_parse(buf); touch_nmi_watchdog(); } -- cgit v1.2.3 From 568fb6f42ac6851320adaea25f8f1b94de14e40a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 27 Sep 2018 17:17:57 +0000 Subject: kdb: print real address of pointers instead of hashed addresses Since commit ad67b74d2469 ("printk: hash addresses printed with %p"), all pointers printed with %p are printed with hashed addresses instead of real addresses in order to avoid leaking addresses in dmesg and syslog. But this applies to kdb too, with is unfortunate: Entering kdb (current=0x(ptrval), pid 329) due to Keyboard Entry kdb> ps 15 sleeping system daemon (state M) processes suppressed, use 'ps A' to see all. Task Addr Pid Parent [*] cpu State Thread Command 0x(ptrval) 329 328 1 0 R 0x(ptrval) *sh 0x(ptrval) 1 0 0 0 S 0x(ptrval) init 0x(ptrval) 3 2 0 0 D 0x(ptrval) rcu_gp 0x(ptrval) 4 2 0 0 D 0x(ptrval) rcu_par_gp 0x(ptrval) 5 2 0 0 D 0x(ptrval) kworker/0:0 0x(ptrval) 6 2 0 0 D 0x(ptrval) kworker/0:0H 0x(ptrval) 7 2 0 0 D 0x(ptrval) kworker/u2:0 0x(ptrval) 8 2 0 0 D 0x(ptrval) mm_percpu_wq 0x(ptrval) 10 2 0 0 D 0x(ptrval) rcu_preempt The whole purpose of kdb is to debug, and for debugging real addresses need to be known. In addition, data displayed by kdb doesn't go into dmesg. This patch replaces all %p by %px in kdb in order to display real addresses. Fixes: ad67b74d2469 ("printk: hash addresses printed with %p") Cc: Signed-off-by: Christophe Leroy Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_main.c | 14 +++++++------- kernel/debug/kdb/kdb_support.c | 12 ++++++------ 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index bb4fe4e1a601..959242084b40 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1192,7 +1192,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, if (reason == KDB_REASON_DEBUG) { /* special case below */ } else { - kdb_printf("\nEntering kdb (current=0x%p, pid %d) ", + kdb_printf("\nEntering kdb (current=0x%px, pid %d) ", kdb_current, kdb_current ? kdb_current->pid : 0); #if defined(CONFIG_SMP) kdb_printf("on processor %d ", raw_smp_processor_id()); @@ -1208,7 +1208,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, */ switch (db_result) { case KDB_DB_BPT: - kdb_printf("\nEntering kdb (0x%p, pid %d) ", + kdb_printf("\nEntering kdb (0x%px, pid %d) ", kdb_current, kdb_current->pid); #if defined(CONFIG_SMP) kdb_printf("on processor %d ", raw_smp_processor_id()); @@ -2048,7 +2048,7 @@ static int kdb_lsmod(int argc, const char **argv) if (mod->state == MODULE_STATE_UNFORMED) continue; - kdb_printf("%-20s%8u 0x%p ", mod->name, + kdb_printf("%-20s%8u 0x%px ", mod->name, mod->core_layout.size, (void *)mod); #ifdef CONFIG_MODULE_UNLOAD kdb_printf("%4d ", module_refcount(mod)); @@ -2059,7 +2059,7 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf(" (Loading)"); else kdb_printf(" (Live)"); - kdb_printf(" 0x%p", mod->core_layout.base); + kdb_printf(" 0x%px", mod->core_layout.base); #ifdef CONFIG_MODULE_UNLOAD { @@ -2341,7 +2341,7 @@ void kdb_ps1(const struct task_struct *p) return; cpu = kdb_process_cpu(p); - kdb_printf("0x%p %8d %8d %d %4d %c 0x%p %c%s\n", + kdb_printf("0x%px %8d %8d %d %4d %c 0x%px %c%s\n", (void *)p, p->pid, p->parent->pid, kdb_task_has_cpu(p), kdb_process_cpu(p), kdb_task_state_char(p), @@ -2354,7 +2354,7 @@ void kdb_ps1(const struct task_struct *p) } else { if (KDB_TSK(cpu) != p) kdb_printf(" Error: does not match running " - "process table (0x%p)\n", KDB_TSK(cpu)); + "process table (0x%px)\n", KDB_TSK(cpu)); } } } @@ -2687,7 +2687,7 @@ int kdb_register_flags(char *cmd, for_each_kdbcmd(kp, i) { if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { kdb_printf("Duplicate kdb command registered: " - "%s, func %p help %s\n", cmd, func, help); + "%s, func %px help %s\n", cmd, func, help); return 1; } } diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 990b3cc526c8..987eb73284d2 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -40,7 +40,7 @@ int kdbgetsymval(const char *symname, kdb_symtab_t *symtab) { if (KDB_DEBUG(AR)) - kdb_printf("kdbgetsymval: symname=%s, symtab=%p\n", symname, + kdb_printf("kdbgetsymval: symname=%s, symtab=%px\n", symname, symtab); memset(symtab, 0, sizeof(*symtab)); symtab->sym_start = kallsyms_lookup_name(symname); @@ -88,7 +88,7 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) char *knt1 = NULL; if (KDB_DEBUG(AR)) - kdb_printf("kdbnearsym: addr=0x%lx, symtab=%p\n", addr, symtab); + kdb_printf("kdbnearsym: addr=0x%lx, symtab=%px\n", addr, symtab); memset(symtab, 0, sizeof(*symtab)); if (addr < 4096) @@ -149,7 +149,7 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) symtab->mod_name = "kernel"; if (KDB_DEBUG(AR)) kdb_printf("kdbnearsym: returns %d symtab->sym_start=0x%lx, " - "symtab->mod_name=%p, symtab->sym_name=%p (%s)\n", ret, + "symtab->mod_name=%px, symtab->sym_name=%px (%s)\n", ret, symtab->sym_start, symtab->mod_name, symtab->sym_name, symtab->sym_name); @@ -887,13 +887,13 @@ void debug_kusage(void) __func__, dah_first); if (dah_first) { h_used = (struct debug_alloc_header *)debug_alloc_pool; - kdb_printf("%s: h_used %p size %d\n", __func__, h_used, + kdb_printf("%s: h_used %px size %d\n", __func__, h_used, h_used->size); } do { h_used = (struct debug_alloc_header *) ((char *)h_free + dah_overhead + h_free->size); - kdb_printf("%s: h_used %p size %d caller %p\n", + kdb_printf("%s: h_used %px size %d caller %px\n", __func__, h_used, h_used->size, h_used->caller); h_free = (struct debug_alloc_header *) (debug_alloc_pool + h_free->next); @@ -902,7 +902,7 @@ void debug_kusage(void) ((char *)h_free + dah_overhead + h_free->size); if ((char *)h_used - debug_alloc_pool != sizeof(debug_alloc_pool_aligned)) - kdb_printf("%s: h_used %p size %d caller %p\n", + kdb_printf("%s: h_used %px size %d caller %px\n", __func__, h_used, h_used->size, h_used->caller); out: spin_unlock(&dap_lock); -- cgit v1.2.3 From c2b94c72d93d0929f48157eef128c4f9d2e603ce Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Thu, 20 Sep 2018 08:59:14 -0400 Subject: kdb: Use strscpy with destination buffer size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc 8.1.0 warns with: kernel/debug/kdb/kdb_support.c: In function ‘kallsyms_symbol_next’: kernel/debug/kdb/kdb_support.c:239:4: warning: ‘strncpy’ specified bound depends on the length of the source argument [-Wstringop-overflow=] strncpy(prefix_name, name, strlen(name)+1); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ kernel/debug/kdb/kdb_support.c:239:31: note: length computed here Use strscpy() with the destination buffer size, and use ellipses when displaying truncated symbols. v2: Use strscpy() Signed-off-by: Prarit Bhargava Cc: Jonathan Toppins Cc: Jason Wessel Cc: Daniel Thompson Cc: kgdb-bugreport@lists.sourceforge.net Reviewed-by: Daniel Thompson Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_io.c | 15 +++++++++------ kernel/debug/kdb/kdb_private.h | 2 +- kernel/debug/kdb/kdb_support.c | 10 +++++----- 3 files changed, 15 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index ed5d34925ad0..6a4b41484afe 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -216,7 +216,7 @@ static char *kdb_read(char *buffer, size_t bufsize) int count; int i; int diag, dtab_count; - int key; + int key, buf_size, ret; diag = kdbgetintenv("DTABCOUNT", &dtab_count); @@ -336,9 +336,8 @@ poll_again: else p_tmp = tmpbuffer; len = strlen(p_tmp); - count = kallsyms_symbol_complete(p_tmp, - sizeof(tmpbuffer) - - (p_tmp - tmpbuffer)); + buf_size = sizeof(tmpbuffer) - (p_tmp - tmpbuffer); + count = kallsyms_symbol_complete(p_tmp, buf_size); if (tab == 2 && count > 0) { kdb_printf("\n%d symbols are found.", count); if (count > dtab_count) { @@ -350,9 +349,13 @@ poll_again: } kdb_printf("\n"); for (i = 0; i < count; i++) { - if (WARN_ON(!kallsyms_symbol_next(p_tmp, i))) + ret = kallsyms_symbol_next(p_tmp, i, buf_size); + if (WARN_ON(!ret)) break; - kdb_printf("%s ", p_tmp); + if (ret != -E2BIG) + kdb_printf("%s ", p_tmp); + else + kdb_printf("%s... ", p_tmp); *(p_tmp + len) = '\0'; } if (i >= dtab_count) diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 1e5a502ba4a7..2118d8258b7c 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -83,7 +83,7 @@ typedef struct __ksymtab { unsigned long sym_start; unsigned long sym_end; } kdb_symtab_t; -extern int kallsyms_symbol_next(char *prefix_name, int flag); +extern int kallsyms_symbol_next(char *prefix_name, int flag, int buf_size); extern int kallsyms_symbol_complete(char *prefix_name, int max_len); /* Exported Symbols for kernel loadable modules to use. */ diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 987eb73284d2..b14b0925c184 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -221,11 +221,13 @@ int kallsyms_symbol_complete(char *prefix_name, int max_len) * Parameters: * prefix_name prefix of a symbol name to lookup * flag 0 means search from the head, 1 means continue search. + * buf_size maximum length that can be written to prefix_name + * buffer * Returns: * 1 if a symbol matches the given prefix. * 0 if no string found */ -int kallsyms_symbol_next(char *prefix_name, int flag) +int kallsyms_symbol_next(char *prefix_name, int flag, int buf_size) { int prefix_len = strlen(prefix_name); static loff_t pos; @@ -235,10 +237,8 @@ int kallsyms_symbol_next(char *prefix_name, int flag) pos = 0; while ((name = kdb_walk_kallsyms(&pos))) { - if (strncmp(name, prefix_name, prefix_len) == 0) { - strncpy(prefix_name, name, strlen(name)+1); - return 1; - } + if (!strncmp(name, prefix_name, prefix_len)) + return strscpy(prefix_name, name, buf_size); } return 0; } -- cgit v1.2.3 From 9eb62f0e1bc70ebc9b15837a0c4e8f12a7b910cb Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 16 Aug 2018 09:01:41 -0500 Subject: kdb: kdb_main: refactor code in kdb_md_line Replace the whole switch statement with a for loop. This makes the code clearer and easy to read. This also addresses the following Coverity warnings: Addresses-Coverity-ID: 115090 ("Missing break in switch") Addresses-Coverity-ID: 115091 ("Missing break in switch") Addresses-Coverity-ID: 114700 ("Missing break in switch") Suggested-by: Daniel Thompson Signed-off-by: Gustavo A. R. Silva Reviewed-by: Daniel Thompson [daniel.thompson@linaro.org: Tiny grammar change in description] Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_main.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 959242084b40..d72b32c66f7d 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1493,6 +1493,7 @@ static void kdb_md_line(const char *fmtstr, unsigned long addr, char cbuf[32]; char *c = cbuf; int i; + int j; unsigned long word; memset(cbuf, '\0', sizeof(cbuf)); @@ -1538,25 +1539,9 @@ static void kdb_md_line(const char *fmtstr, unsigned long addr, wc.word = word; #define printable_char(c) \ ({unsigned char __c = c; isascii(__c) && isprint(__c) ? __c : '.'; }) - switch (bytesperword) { - case 8: + for (j = 0; j < bytesperword; j++) *c++ = printable_char(*cp++); - *c++ = printable_char(*cp++); - *c++ = printable_char(*cp++); - *c++ = printable_char(*cp++); - addr += 4; - case 4: - *c++ = printable_char(*cp++); - *c++ = printable_char(*cp++); - addr += 2; - case 2: - *c++ = printable_char(*cp++); - addr++; - case 1: - *c++ = printable_char(*cp++); - addr++; - break; - } + addr += bytesperword; #undef printable_char } } -- cgit v1.2.3 From 01cb37351bafc1b44b962842926210115e231f0a Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sat, 4 Aug 2018 23:18:25 -0500 Subject: kdb: kdb_keyboard: mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Notice that in this particular case, I replaced the code comments with a proper "fall through" annotation, which is what GCC is expecting to find. Signed-off-by: Gustavo A. R. Silva Reviewed-by: Daniel Thompson Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_keyboard.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c index 118527aa60ea..750497b0003a 100644 --- a/kernel/debug/kdb/kdb_keyboard.c +++ b/kernel/debug/kdb/kdb_keyboard.c @@ -173,11 +173,11 @@ int kdb_get_kbd_char(void) case KT_LATIN: if (isprint(keychar)) break; /* printable characters */ - /* drop through */ + /* fall through */ case KT_SPEC: if (keychar == K_ENTER) break; - /* drop through */ + /* fall through */ default: return -1; /* ignore unprintables */ } -- cgit v1.2.3 From 646558ff1643467d3b941b47f519867cbca462c3 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sat, 4 Aug 2018 21:48:44 -0500 Subject: kdb: kdb_support: mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Notice that in this particular case, I replaced the code comments with a proper "fall through" annotation, which is what GCC is expecting to find. Signed-off-by: Gustavo A. R. Silva Reviewed-by: Daniel Thompson Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_support.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index b14b0925c184..50bf9b119bad 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -432,7 +432,7 @@ int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size) *word = w8; break; } - /* drop through */ + /* fall through */ default: diag = KDB_BADWIDTH; kdb_printf("kdb_getphysword: bad width %ld\n", (long) size); @@ -481,7 +481,7 @@ int kdb_getword(unsigned long *word, unsigned long addr, size_t size) *word = w8; break; } - /* drop through */ + /* fall through */ default: diag = KDB_BADWIDTH; kdb_printf("kdb_getword: bad width %ld\n", (long) size); @@ -525,7 +525,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size) diag = kdb_putarea(addr, w8); break; } - /* drop through */ + /* fall through */ default: diag = KDB_BADWIDTH; kdb_printf("kdb_putword: bad width %ld\n", (long) size); -- cgit v1.2.3 From afd594240806acc138cf696c09f2f4829d55d02f Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Fri, 16 Nov 2018 12:00:07 +0000 Subject: bpf: fix off-by-one error in adjust_subprog_starts When patching in a new sequence for the first insn of a subprog, the start of that subprog does not change (it's the first insn of the sequence), so adjust_subprog_starts should check start <= off (rather than < off). Also added a test to test_verifier.c (it's essentially the syz reproducer). Fixes: cc8b0b92a169 ("bpf: introduce function calls (function boundaries)") Reported-by: syzbot+4fc427c7af994b0948be@syzkaller.appspotmail.com Signed-off-by: Edward Cree Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 +- tools/testing/selftests/bpf/test_verifier.c | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1971ca325fb4..6dd419550aba 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5650,7 +5650,7 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len return; /* NOTE: fake 'exit' subprog should be updated as well. */ for (i = 0; i <= env->subprog_cnt; i++) { - if (env->subprog_info[i].start < off) + if (env->subprog_info[i].start <= off) continue; env->subprog_info[i].start += len - 1; } diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 6f61df62f690..550b7e46bf4a 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -13896,6 +13896,25 @@ static struct bpf_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = ACCEPT, }, + { + "calls: ctx read at start of subprog", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 5), + BPF_JMP_REG(BPF_JSGT, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_B, BPF_REG_9, BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, + .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .result_unpriv = REJECT, + .result = ACCEPT, + }, }; static int probe_filter_length(const struct bpf_insn *fp) -- cgit v1.2.3 From 569a933b03f3c48b392fe67c0086b3a6b9306b5a Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 14 Nov 2018 10:00:34 -0800 Subject: bpf: allocate local storage buffers using GFP_ATOMIC Naresh reported an issue with the non-atomic memory allocation of cgroup local storage buffers: [ 73.047526] BUG: sleeping function called from invalid context at /srv/oe/build/tmp-rpb-glibc/work-shared/intel-corei7-64/kernel-source/mm/slab.h:421 [ 73.060915] in_atomic(): 1, irqs_disabled(): 0, pid: 3157, name: test_cgroup_sto [ 73.068342] INFO: lockdep is turned off. [ 73.072293] CPU: 2 PID: 3157 Comm: test_cgroup_sto Not tainted 4.20.0-rc2-next-20181113 #1 [ 73.080548] Hardware name: Supermicro SYS-5019S-ML/X11SSH-F, BIOS 2.0b 07/27/2017 [ 73.088018] Call Trace: [ 73.090463] dump_stack+0x70/0xa5 [ 73.093783] ___might_sleep+0x152/0x240 [ 73.097619] __might_sleep+0x4a/0x80 [ 73.101191] __kmalloc_node+0x1cf/0x2f0 [ 73.105031] ? cgroup_storage_update_elem+0x46/0x90 [ 73.109909] cgroup_storage_update_elem+0x46/0x90 cgroup_storage_update_elem() (as well as other update map update callbacks) is called with disabled preemption, so GFP_ATOMIC allocation should be used: e.g. alloc_htab_elem() in hashtab.c. Reported-by: Naresh Kamboju Tested-by: Naresh Kamboju Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/local_storage.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index c97a8f968638..bed9d48a7ae9 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -139,7 +139,8 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, return -ENOENT; new = kmalloc_node(sizeof(struct bpf_storage_buffer) + - map->value_size, __GFP_ZERO | GFP_USER, + map->value_size, + __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, map->numa_node); if (!new) return -ENOMEM; -- cgit v1.2.3 From 8fcb2312d1e3300e81aa871aad00d4c038cfc184 Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Fri, 16 Nov 2018 15:08:00 -0800 Subject: kernel/sched/psi.c: simplify cgroup_move_task() The existing code triggered an invalid warning about 'rq' possibly being used uninitialized. Instead of doing the silly warning suppression by initializa it to NULL, refactor the code to bail out early instead. Warning was: kernel/sched/psi.c: In function `cgroup_move_task': kernel/sched/psi.c:639:13: warning: `rq' may be used uninitialized in this function [-Wmaybe-uninitialized] Link: http://lkml.kernel.org/r/20181103183339.8669-1-olof@lixom.net Fixes: 2ce7135adc9ad ("psi: cgroup support") Signed-off-by: Olof Johansson Reviewed-by: Andrew Morton Acked-by: Johannes Weiner Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 7cdecfc010af..3d7355d7c3e3 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -633,38 +633,39 @@ void psi_cgroup_free(struct cgroup *cgroup) */ void cgroup_move_task(struct task_struct *task, struct css_set *to) { - bool move_psi = !psi_disabled; unsigned int task_flags = 0; struct rq_flags rf; struct rq *rq; - if (move_psi) { - rq = task_rq_lock(task, &rf); + if (psi_disabled) { + /* + * Lame to do this here, but the scheduler cannot be locked + * from the outside, so we move cgroups from inside sched/. + */ + rcu_assign_pointer(task->cgroups, to); + return; + } - if (task_on_rq_queued(task)) - task_flags = TSK_RUNNING; - else if (task->in_iowait) - task_flags = TSK_IOWAIT; + rq = task_rq_lock(task, &rf); - if (task->flags & PF_MEMSTALL) - task_flags |= TSK_MEMSTALL; + if (task_on_rq_queued(task)) + task_flags = TSK_RUNNING; + else if (task->in_iowait) + task_flags = TSK_IOWAIT; - if (task_flags) - psi_task_change(task, task_flags, 0); - } + if (task->flags & PF_MEMSTALL) + task_flags |= TSK_MEMSTALL; - /* - * Lame to do this here, but the scheduler cannot be locked - * from the outside, so we move cgroups from inside sched/. - */ + if (task_flags) + psi_task_change(task, task_flags, 0); + + /* See comment above */ rcu_assign_pointer(task->cgroups, to); - if (move_psi) { - if (task_flags) - psi_task_change(task, 0, task_flags); + if (task_flags) + psi_task_change(task, 0, task_flags); - task_rq_unlock(rq, task, &rf); - } + task_rq_unlock(rq, task, &rf); } #endif /* CONFIG_CGROUPS */ -- cgit v1.2.3 From cb216b84d6ea24fa10f1e7aac35de77246841041 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 21 Nov 2018 16:00:51 +0000 Subject: swiotlb: Skip cache maintenance on map error If swiotlb_bounce_page() failed, calling arch_sync_dma_for_device() may lead to such delights as performing cache maintenance on whatever address phys_to_virt(SWIOTLB_MAP_ERROR) looks like, which is typically outside the kernel memory map and goes about as well as expected. Don't do that. Fixes: a4a4330db46a ("swiotlb: add support for non-coherent DMA") Tested-by: John Stultz Signed-off-by: Robin Murphy Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 5731daa09a32..045930e32c0e 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -679,7 +679,8 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, } if (!dev_is_dma_coherent(dev) && - (attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0) + (attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0 && + dev_addr != DIRECT_MAPPING_ERROR) arch_sync_dma_for_device(dev, phys, size, dir); return dev_addr; -- cgit v1.2.3 From 813961de3ee6474dd5703e883471fd941d6c8f69 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 22 Nov 2018 10:49:56 -0800 Subject: bpf: fix integer overflow in queue_stack_map Fix the following issues: - allow queue_stack_map for root only - fix u32 max_entries overflow - disallow value_size == 0 Fixes: f1a2e44a3aec ("bpf: add queue and stack maps") Reported-by: Wei Wu Signed-off-by: Alexei Starovoitov Cc: Mauricio Vasquez B Signed-off-by: Daniel Borkmann --- kernel/bpf/queue_stack_maps.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 8bbd72d3a121..b384ea9f3254 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "percpu_freelist.h" #define QUEUE_STACK_CREATE_FLAG_MASK \ @@ -45,8 +46,12 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) /* Called from syscall */ static int queue_stack_map_alloc_check(union bpf_attr *attr) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 0 || + attr->value_size == 0 || attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK) return -EINVAL; @@ -63,15 +68,10 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) { int ret, numa_node = bpf_map_attr_numa_node(attr); struct bpf_queue_stack *qs; - u32 size, value_size; - u64 queue_size, cost; - - size = attr->max_entries + 1; - value_size = attr->value_size; - - queue_size = sizeof(*qs) + (u64) value_size * size; + u64 size, queue_size, cost; - cost = queue_size; + size = (u64) attr->max_entries + 1; + cost = queue_size = sizeof(*qs) + size * attr->value_size; if (cost >= U32_MAX - PAGE_SIZE) return ERR_PTR(-E2BIG); -- cgit v1.2.3 From 09d3f015d1e1b4fee7e9bbdcf54201d239393391 Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Thu, 22 Nov 2018 17:10:31 +0100 Subject: uprobes: Fix handle_swbp() vs. unregister() + register() race once more Commit: 142b18ddc8143 ("uprobes: Fix handle_swbp() vs unregister() + register() race") added the UPROBE_COPY_INSN flag, and corresponding smp_wmb() and smp_rmb() memory barriers, to ensure that handle_swbp() uses fully-initialized uprobes only. However, the smp_rmb() is mis-placed: this barrier should be placed after handle_swbp() has tested for the flag, thus guaranteeing that (program-order) subsequent loads from the uprobe can see the initial stores performed by prepare_uprobe(). Move the smp_rmb() accordingly. Also amend the comments associated to the two memory barriers to indicate their actual locations. Signed-off-by: Andrea Parri Acked-by: Oleg Nesterov Cc: Alexander Shishkin Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: stable@kernel.org Fixes: 142b18ddc8143 ("uprobes: Fix handle_swbp() vs unregister() + register() race") Link: http://lkml.kernel.org/r/20181122161031.15179-1-andrea.parri@amarulasolutions.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 96d4bee83489..322e97bbb437 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -829,7 +829,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, BUG_ON((uprobe->offset & ~PAGE_MASK) + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); - smp_wmb(); /* pairs with rmb() in find_active_uprobe() */ + smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */ set_bit(UPROBE_COPY_INSN, &uprobe->flags); out: @@ -2178,10 +2178,18 @@ static void handle_swbp(struct pt_regs *regs) * After we hit the bp, _unregister + _register can install the * new and not-yet-analyzed uprobe at the same address, restart. */ - smp_rmb(); /* pairs with wmb() in install_breakpoint() */ if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) goto out; + /* + * Pairs with the smp_wmb() in prepare_uprobe(). + * + * Guarantees that if we see the UPROBE_COPY_INSN bit set, then + * we must also see the stores to &uprobe->arch performed by the + * prepare_uprobe() call. + */ + smp_rmb(); + /* Tracing handlers use ->utask to communicate with fetch methods */ if (!get_utask()) goto out; -- cgit v1.2.3 From 1efb6ee3edea57f57f9fb05dba8dcb3f7333f61f Mon Sep 17 00:00:00 2001 From: Martynas Pumputis Date: Fri, 23 Nov 2018 17:43:26 +0100 Subject: bpf: fix check of allowed specifiers in bpf_trace_printk A format string consisting of "%p" or "%s" followed by an invalid specifier (e.g. "%p%\n" or "%s%") could pass the check which would make format_decode (lib/vsprintf.c) to warn. Fixes: 9c959c863f82 ("tracing: Allow BPF programs to call bpf_trace_printk()") Reported-by: syzbot+1ec5c5ec949c4adaa0c4@syzkaller.appspotmail.com Signed-off-by: Martynas Pumputis Signed-off-by: Daniel Borkmann --- kernel/trace/bpf_trace.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 08fcfe440c63..9864a35c8bb5 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -196,11 +196,13 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, i++; } else if (fmt[i] == 'p' || fmt[i] == 's') { mod[fmt_cnt]++; - i++; - if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0) + /* disallow any further format extensions */ + if (fmt[i + 1] != 0 && + !isspace(fmt[i + 1]) && + !ispunct(fmt[i + 1])) return -EINVAL; fmt_cnt++; - if (fmt[i - 1] == 's') { + if (fmt[i] == 's') { if (str_seen) /* allow only one '%s' per fmt string */ return -EINVAL; -- cgit v1.2.3 From 8114865ff82e200b383e46821c25cb0625b842b5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 18 Nov 2018 17:10:15 -0500 Subject: function_graph: Create function_graph_enter() to consolidate architecture code Currently all the architectures do basically the same thing in preparing the function graph tracer on entry to a function. This code can be pulled into a generic location and then this will allow the function graph tracer to be fixed, as well as extended. Create a new function graph helper function_graph_enter() that will call the hook function (ftrace_graph_entry) and the shadow stack operation (ftrace_push_return_trace), and remove the need of the architecture code to manage the shadow stack. This is needed to prepare for a fix of a design bug on how the curr_ret_stack is used. Cc: stable@kernel.org Fixes: 03274a3ffb449 ("tracing/fgraph: Adjust fgraph depth before calling trace return callback") Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 3 +++ kernel/trace/trace_functions_graph.c | 16 ++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index a397907e8d72..5717e8f81c59 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -779,6 +779,9 @@ extern void return_to_handler(void); extern int ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, unsigned long frame_pointer, unsigned long *retp); +extern int +function_graph_enter(unsigned long ret, unsigned long func, + unsigned long frame_pointer, unsigned long *retp); unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, unsigned long ret, unsigned long *retp); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 169b3c44ee97..28f2602435d0 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -182,6 +182,22 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, return 0; } +int function_graph_enter(unsigned long ret, unsigned long func, + unsigned long frame_pointer, unsigned long *retp) +{ + struct ftrace_graph_ent trace; + + trace.func = func; + trace.depth = current->curr_ret_stack + 1; + + /* Only trace if the calling function expects to */ + if (!ftrace_graph_entry(&trace)) + return -EBUSY; + + return ftrace_push_return_trace(ret, func, &trace.depth, + frame_pointer, retp); +} + /* Retrieve a function return address to the trace stack on thread info.*/ static void ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, -- cgit v1.2.3 From e2c95a61656d29ceaac97b6a975c8a1f26e26f15 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 26 Nov 2018 14:05:38 +0100 Subject: bpf, ppc64: generalize fetching subprog into bpf_jit_get_func_addr Make fetching of the BPF call address from ppc64 JIT generic. ppc64 was using a slightly different variant rather than through the insns' imm field encoding as the target address would not fit into that space. Therefore, the target subprog number was encoded into the insns' offset and fetched through fp->aux->func[off]->bpf_func instead. Given there are other JITs with this issue and the mechanism of fetching the address is JIT-generic, move it into the core as a helper instead. On the JIT side, we get information on whether the retrieved address is a fixed one, that is, not changing through JIT passes, or a dynamic one. For the former, JITs can optimize their imm emission because this doesn't change jump offsets throughout JIT process. Signed-off-by: Daniel Borkmann Reviewed-by: Sandipan Das Tested-by: Sandipan Das Signed-off-by: Alexei Starovoitov --- arch/powerpc/net/bpf_jit_comp64.c | 57 ++++++++++++++++++++++++++------------- include/linux/filter.h | 4 +++ kernel/bpf/core.c | 34 +++++++++++++++++++++++ 3 files changed, 76 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 50b129785aee..17482f5de3e2 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -166,7 +166,33 @@ static void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx) PPC_BLR(); } -static void bpf_jit_emit_func_call(u32 *image, struct codegen_context *ctx, u64 func) +static void bpf_jit_emit_func_call_hlp(u32 *image, struct codegen_context *ctx, + u64 func) +{ +#ifdef PPC64_ELF_ABI_v1 + /* func points to the function descriptor */ + PPC_LI64(b2p[TMP_REG_2], func); + /* Load actual entry point from function descriptor */ + PPC_BPF_LL(b2p[TMP_REG_1], b2p[TMP_REG_2], 0); + /* ... and move it to LR */ + PPC_MTLR(b2p[TMP_REG_1]); + /* + * Load TOC from function descriptor at offset 8. + * We can clobber r2 since we get called through a + * function pointer (so caller will save/restore r2) + * and since we don't use a TOC ourself. + */ + PPC_BPF_LL(2, b2p[TMP_REG_2], 8); +#else + /* We can clobber r12 */ + PPC_FUNC_ADDR(12, func); + PPC_MTLR(12); +#endif + PPC_BLRL(); +} + +static void bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, + u64 func) { unsigned int i, ctx_idx = ctx->idx; @@ -273,7 +299,7 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, { const struct bpf_insn *insn = fp->insnsi; int flen = fp->len; - int i; + int i, ret; /* Start of epilogue code - will only be valid 2nd pass onwards */ u32 exit_addr = addrs[flen]; @@ -284,8 +310,9 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 src_reg = b2p[insn[i].src_reg]; s16 off = insn[i].off; s32 imm = insn[i].imm; + bool func_addr_fixed; + u64 func_addr; u64 imm64; - u8 *func; u32 true_cond; u32 tmp_idx; @@ -711,23 +738,15 @@ emit_clear: case BPF_JMP | BPF_CALL: ctx->seen |= SEEN_FUNC; - /* bpf function call */ - if (insn[i].src_reg == BPF_PSEUDO_CALL) - if (!extra_pass) - func = NULL; - else if (fp->aux->func && off < fp->aux->func_cnt) - /* use the subprog id from the off - * field to lookup the callee address - */ - func = (u8 *) fp->aux->func[off]->bpf_func; - else - return -EINVAL; - /* kernel helper call */ - else - func = (u8 *) __bpf_call_base + imm; - - bpf_jit_emit_func_call(image, ctx, (u64)func); + ret = bpf_jit_get_func_addr(fp, &insn[i], extra_pass, + &func_addr, &func_addr_fixed); + if (ret < 0) + return ret; + if (func_addr_fixed) + bpf_jit_emit_func_call_hlp(image, ctx, func_addr); + else + bpf_jit_emit_func_call_rel(image, ctx, func_addr); /* move return value from r3 to BPF_REG_0 */ PPC_MR(b2p[BPF_REG_0], 3); break; diff --git a/include/linux/filter.h b/include/linux/filter.h index de629b706d1d..448dcc448f1f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -866,6 +866,10 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr); void bpf_jit_free(struct bpf_prog *fp); +int bpf_jit_get_func_addr(const struct bpf_prog *prog, + const struct bpf_insn *insn, bool extra_pass, + u64 *func_addr, bool *func_addr_fixed); + struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp); void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1a796e0799ec..b1a3545d0ec8 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -672,6 +672,40 @@ void __weak bpf_jit_free(struct bpf_prog *fp) bpf_prog_unlock_free(fp); } +int bpf_jit_get_func_addr(const struct bpf_prog *prog, + const struct bpf_insn *insn, bool extra_pass, + u64 *func_addr, bool *func_addr_fixed) +{ + s16 off = insn->off; + s32 imm = insn->imm; + u8 *addr; + + *func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL; + if (!*func_addr_fixed) { + /* Place-holder address till the last pass has collected + * all addresses for JITed subprograms in which case we + * can pick them up from prog->aux. + */ + if (!extra_pass) + addr = NULL; + else if (prog->aux->func && + off >= 0 && off < prog->aux->func_cnt) + addr = (u8 *)prog->aux->func[off]->bpf_func; + else + return -EINVAL; + } else { + /* Address of a BPF helper call. Since part of the core + * kernel, it's always at a fixed location. __bpf_call_base + * and the helper with imm relative to it are both in core + * kernel. + */ + addr = (u8 *)__bpf_call_base + imm; + } + + *func_addr = (unsigned long)addr; + return 0; +} + static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, struct bpf_insn *to_buff) -- cgit v1.2.3 From d125f3f866df88da5a85df00291f88f0baa89f7c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 19 Nov 2018 07:40:39 -0500 Subject: function_graph: Make ftrace_push_return_trace() static As all architectures now call function_graph_enter() to do the entry work, no architecture should ever call ftrace_push_return_trace(). Make it static. This is needed to prepare for a fix of a design bug on how the curr_ret_stack is used. Cc: stable@kernel.org Fixes: 03274a3ffb449 ("tracing/fgraph: Adjust fgraph depth before calling trace return callback") Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 3 --- kernel/trace/trace_functions_graph.c | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 5717e8f81c59..dd16e8218db3 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -776,9 +776,6 @@ struct ftrace_ret_stack { */ extern void return_to_handler(void); -extern int -ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, - unsigned long frame_pointer, unsigned long *retp); extern int function_graph_enter(unsigned long ret, unsigned long func, unsigned long frame_pointer, unsigned long *retp); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 28f2602435d0..88ca787a1cdc 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -118,7 +118,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, struct trace_seq *s, u32 flags); /* Add a function return address to the trace stack on thread info.*/ -int +static int ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, unsigned long frame_pointer, unsigned long *retp) { -- cgit v1.2.3 From 39eb456dacb543de90d3bc6a8e0ac5cf51ac475e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 19 Nov 2018 08:07:12 -0500 Subject: function_graph: Use new curr_ret_depth to manage depth instead of curr_ret_stack Currently, the depth of the ret_stack is determined by curr_ret_stack index. The issue is that there's a race between setting of the curr_ret_stack and calling of the callback attached to the return of the function. Commit 03274a3ffb44 ("tracing/fgraph: Adjust fgraph depth before calling trace return callback") moved the calling of the callback to after the setting of the curr_ret_stack, even stating that it was safe to do so, when in fact, it was the reason there was a barrier() there (yes, I should have commented that barrier()). Not only does the curr_ret_stack keep track of the current call graph depth, it also keeps the ret_stack content from being overwritten by new data. The function profiler, uses the "subtime" variable of ret_stack structure and by moving the curr_ret_stack, it allows for interrupts to use the same structure it was using, corrupting the data, and breaking the profiler. To fix this, there needs to be two variables to handle the call stack depth and the pointer to where the ret_stack is being used, as they need to change at two different locations. Cc: stable@kernel.org Fixes: 03274a3ffb449 ("tracing/fgraph: Adjust fgraph depth before calling trace return callback") Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/sched.h | 1 + kernel/trace/ftrace.c | 3 +++ kernel/trace/trace_functions_graph.c | 21 +++++++++++++-------- 3 files changed, 17 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index a51c13c2b1a0..d6183a55e8eb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1116,6 +1116,7 @@ struct task_struct { #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored address in ret_stack: */ int curr_ret_stack; + int curr_ret_depth; /* Stack of return addresses for return function tracing: */ struct ftrace_ret_stack *ret_stack; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f536f601bd46..48513954713c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6814,6 +6814,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); t->curr_ret_stack = -1; + t->curr_ret_depth = -1; /* Make sure the tasks see the -1 first: */ smp_wmb(); t->ret_stack = ret_stack_list[start++]; @@ -7038,6 +7039,7 @@ graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) { t->curr_ret_stack = -1; + t->curr_ret_depth = -1; /* * The idle task has no parent, it either has its own * stack or no stack at all. @@ -7068,6 +7070,7 @@ void ftrace_graph_init_task(struct task_struct *t) /* Make sure we do not use the parent ret_stack */ t->ret_stack = NULL; t->curr_ret_stack = -1; + t->curr_ret_depth = -1; if (ftrace_graph_active) { struct ftrace_ret_stack *ret_stack; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 88ca787a1cdc..02d4081a7f5a 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -119,7 +119,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, /* Add a function return address to the trace stack on thread info.*/ static int -ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, +ftrace_push_return_trace(unsigned long ret, unsigned long func, unsigned long frame_pointer, unsigned long *retp) { unsigned long long calltime; @@ -177,8 +177,6 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, #ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR current->ret_stack[index].retp = retp; #endif - *depth = current->curr_ret_stack; - return 0; } @@ -188,14 +186,20 @@ int function_graph_enter(unsigned long ret, unsigned long func, struct ftrace_graph_ent trace; trace.func = func; - trace.depth = current->curr_ret_stack + 1; + trace.depth = ++current->curr_ret_depth; /* Only trace if the calling function expects to */ if (!ftrace_graph_entry(&trace)) - return -EBUSY; + goto out; - return ftrace_push_return_trace(ret, func, &trace.depth, - frame_pointer, retp); + if (ftrace_push_return_trace(ret, func, + frame_pointer, retp)) + goto out; + + return 0; + out: + current->curr_ret_depth--; + return -EBUSY; } /* Retrieve a function return address to the trace stack on thread info.*/ @@ -257,7 +261,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, trace->func = current->ret_stack[index].func; trace->calltime = current->ret_stack[index].calltime; trace->overrun = atomic_read(¤t->trace_overrun); - trace->depth = index; + trace->depth = current->curr_ret_depth; } /* @@ -273,6 +277,7 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) trace.rettime = trace_clock_local(); barrier(); current->curr_ret_stack--; + current->curr_ret_depth--; /* * The curr_ret_stack can be less than -1 only if it was * filtered out and it's about to return from the function. -- cgit v1.2.3 From 552701dd0fa7c3d448142e87210590ba424694a0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 19 Nov 2018 15:18:40 -0500 Subject: function_graph: Move return callback before update of curr_ret_stack In the past, curr_ret_stack had two functions. One was to denote the depth of the call graph, the other is to keep track of where on the ret_stack the data is used. Although they may be slightly related, there are two cases where they need to be used differently. The one case is that it keeps the ret_stack data from being corrupted by an interrupt coming in and overwriting the data still in use. The other is just to know where the depth of the stack currently is. The function profiler uses the ret_stack to save a "subtime" variable that is part of the data on the ret_stack. If curr_ret_stack is modified too early, then this variable can be corrupted. The "max_depth" option, when set to 1, will record the first functions going into the kernel. To see all top functions (when dealing with timings), the depth variable needs to be lowered before calling the return hook. But by lowering the curr_ret_stack, it makes the data on the ret_stack still being used by the return hook susceptible to being overwritten. Now that there's two variables to handle both cases (curr_ret_depth), we can move them to the locations where they can handle both cases. Cc: stable@kernel.org Fixes: 03274a3ffb449 ("tracing/fgraph: Adjust fgraph depth before calling trace return callback") Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_functions_graph.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 02d4081a7f5a..4f0d72ae6362 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -261,7 +261,13 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, trace->func = current->ret_stack[index].func; trace->calltime = current->ret_stack[index].calltime; trace->overrun = atomic_read(¤t->trace_overrun); - trace->depth = current->curr_ret_depth; + trace->depth = current->curr_ret_depth--; + /* + * We still want to trace interrupts coming in if + * max_depth is set to 1. Make sure the decrement is + * seen before ftrace_graph_return. + */ + barrier(); } /* @@ -275,9 +281,14 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) ftrace_pop_return_trace(&trace, &ret, frame_pointer); trace.rettime = trace_clock_local(); + ftrace_graph_return(&trace); + /* + * The ftrace_graph_return() may still access the current + * ret_stack structure, we need to make sure the update of + * curr_ret_stack is after that. + */ barrier(); current->curr_ret_stack--; - current->curr_ret_depth--; /* * The curr_ret_stack can be less than -1 only if it was * filtered out and it's about to return from the function. @@ -288,13 +299,6 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } - /* - * The trace should run after decrementing the ret counter - * in case an interrupt were to come in. We don't want to - * lose the interrupt if max_depth is set. - */ - ftrace_graph_return(&trace); - if (unlikely(!ret)) { ftrace_graph_stop(); WARN_ON(1); -- cgit v1.2.3 From 7c6ea35ef50810aa12ab26f21cb858d980881576 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 20 Nov 2018 12:40:25 -0500 Subject: function_graph: Reverse the order of pushing the ret_stack and the callback The function graph profiler uses the ret_stack to store the "subtime" and reuse it by nested functions and also on the return. But the current logic has the profiler callback called before the ret_stack is updated, and it is just modifying the ret_stack that will later be allocated (it's just lucky that the "subtime" is not touched when it is allocated). This could also cause a crash if we are at the end of the ret_stack when this happens. By reversing the order of the allocating the ret_stack and then calling the callbacks attached to a function being traced, the ret_stack entry is no longer used before it is allocated. Cc: stable@kernel.org Fixes: 03274a3ffb449 ("tracing/fgraph: Adjust fgraph depth before calling trace return callback") Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_functions_graph.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4f0d72ae6362..2561460d7baf 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -188,15 +188,17 @@ int function_graph_enter(unsigned long ret, unsigned long func, trace.func = func; trace.depth = ++current->curr_ret_depth; - /* Only trace if the calling function expects to */ - if (!ftrace_graph_entry(&trace)) - goto out; - if (ftrace_push_return_trace(ret, func, frame_pointer, retp)) goto out; + /* Only trace if the calling function expects to */ + if (!ftrace_graph_entry(&trace)) + goto out_ret; + return 0; + out_ret: + current->curr_ret_stack--; out: current->curr_ret_depth--; return -EBUSY; -- cgit v1.2.3 From b1b35f2e218a5b57d03bbc3b0667d5064570dc60 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 20 Nov 2018 12:51:07 -0500 Subject: function_graph: Have profiler use curr_ret_stack and not depth The profiler uses trace->depth to find its entry on the ret_stack, but the depth may not match the actual location of where its entry is (if an interrupt were to preempt the processing of the profiler for another function, the depth and the curr_ret_stack will be different). Have it use the curr_ret_stack as the index to find its ret_stack entry instead of using the depth variable, as that is no longer guaranteed to be the same. Cc: stable@kernel.org Fixes: 03274a3ffb449 ("tracing/fgraph: Adjust fgraph depth before calling trace return callback") Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 48513954713c..77734451cb05 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -817,7 +817,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip, #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int profile_graph_entry(struct ftrace_graph_ent *trace) { - int index = trace->depth; + int index = current->curr_ret_stack; function_profile_call(trace->func, 0, NULL, NULL); @@ -852,7 +852,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) if (!fgraph_graph_time) { int index; - index = trace->depth; + index = current->curr_ret_stack; /* Append this call time to the parent time to subtract */ if (index) -- cgit v1.2.3 From c5511d03ec090980732e929c318a7a6374b5550e Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Sun, 25 Nov 2018 19:33:36 +0100 Subject: sched/smt: Make sched_smt_present track topology Currently the 'sched_smt_present' static key is enabled when at CPU bringup SMT topology is observed, but it is never disabled. However there is demand to also disable the key when the topology changes such that there is no SMT present anymore. Implement this by making the key count the number of cores that have SMT enabled. In particular, the SMT topology bits are set before interrrupts are enabled and similarly, are cleared after interrupts are disabled for the last time and the CPU dies. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185004.246110444@linutronix.de --- kernel/sched/core.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 091e089063be..6fedf3a98581 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5738,15 +5738,10 @@ int sched_cpu_activate(unsigned int cpu) #ifdef CONFIG_SCHED_SMT /* - * The sched_smt_present static key needs to be evaluated on every - * hotplug event because at boot time SMT might be disabled when - * the number of booted CPUs is limited. - * - * If then later a sibling gets hotplugged, then the key would stay - * off and SMT scheduling would never be functional. + * When going up, increment the number of cores with SMT present. */ - if (cpumask_weight(cpu_smt_mask(cpu)) > 1) - static_branch_enable_cpuslocked(&sched_smt_present); + if (cpumask_weight(cpu_smt_mask(cpu)) == 2) + static_branch_inc_cpuslocked(&sched_smt_present); #endif set_cpu_active(cpu, true); @@ -5790,6 +5785,14 @@ int sched_cpu_deactivate(unsigned int cpu) */ synchronize_rcu_mult(call_rcu, call_rcu_sched); +#ifdef CONFIG_SCHED_SMT + /* + * When going down, decrement the number of cores with SMT present. + */ + if (cpumask_weight(cpu_smt_mask(cpu)) == 2) + static_branch_dec_cpuslocked(&sched_smt_present); +#endif + if (!sched_smp_initialized) return 0; -- cgit v1.2.3 From 321a874a7ef85655e93b3206d0f36b4a6097f948 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Nov 2018 19:33:38 +0100 Subject: sched/smt: Expose sched_smt_present static key Make the scheduler's 'sched_smt_present' static key globaly available, so it can be used in the x86 speculation control code. Provide a query function and a stub for the CONFIG_SMP=n case. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185004.430168326@linutronix.de --- include/linux/sched/smt.h | 18 ++++++++++++++++++ kernel/sched/sched.h | 4 +--- 2 files changed, 19 insertions(+), 3 deletions(-) create mode 100644 include/linux/sched/smt.h (limited to 'kernel') diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h new file mode 100644 index 000000000000..c9e0be514110 --- /dev/null +++ b/include/linux/sched/smt.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_SMT_H +#define _LINUX_SCHED_SMT_H + +#include + +#ifdef CONFIG_SCHED_SMT +extern struct static_key_false sched_smt_present; + +static __always_inline bool sched_smt_active(void) +{ + return static_branch_likely(&sched_smt_present); +} +#else +static inline bool sched_smt_active(void) { return false; } +#endif + +#endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 618577fc9aa8..4e524ab589c9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -936,9 +937,6 @@ static inline int cpu_of(struct rq *rq) #ifdef CONFIG_SCHED_SMT - -extern struct static_key_false sched_smt_present; - extern void __update_idle_core(struct rq *rq); static inline void update_idle_core(struct rq *rq) -- cgit v1.2.3 From a74cfffb03b73d41e08f84c2e5c87dec0ce3db9f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Nov 2018 19:33:39 +0100 Subject: x86/speculation: Rework SMT state change arch_smt_update() is only called when the sysfs SMT control knob is changed. This means that when SMT is enabled in the sysfs control knob the system is considered to have SMT active even if all siblings are offline. To allow finegrained control of the speculation mitigations, the actual SMT state is more interesting than the fact that siblings could be enabled. Rework the code, so arch_smt_update() is invoked from each individual CPU hotplug function, and simplify the update function while at it. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185004.521974984@linutronix.de --- arch/x86/kernel/cpu/bugs.c | 11 +++++------ include/linux/sched/smt.h | 2 ++ kernel/cpu.c | 15 +++++++++------ 3 files changed, 16 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index a723af0c4400..5625b323ff32 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -344,16 +345,14 @@ void arch_smt_update(void) return; mutex_lock(&spec_ctrl_mutex); - mask = x86_spec_ctrl_base; - if (cpu_smt_control == CPU_SMT_ENABLED) + + mask = x86_spec_ctrl_base & ~SPEC_CTRL_STIBP; + if (sched_smt_active()) mask |= SPEC_CTRL_STIBP; - else - mask &= ~SPEC_CTRL_STIBP; if (mask != x86_spec_ctrl_base) { pr_info("Spectre v2 cross-process SMT mitigation: %s STIBP\n", - cpu_smt_control == CPU_SMT_ENABLED ? - "Enabling" : "Disabling"); + mask & SPEC_CTRL_STIBP ? "Enabling" : "Disabling"); x86_spec_ctrl_base = mask; on_each_cpu(update_stibp_msr, NULL, 1); } diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h index c9e0be514110..59d3736c454c 100644 --- a/include/linux/sched/smt.h +++ b/include/linux/sched/smt.h @@ -15,4 +15,6 @@ static __always_inline bool sched_smt_active(void) static inline bool sched_smt_active(void) { return false; } #endif +void arch_smt_update(void); + #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index 3c7f3b4c453c..91d5c38eb7e5 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -367,6 +368,12 @@ static void lockdep_release_cpus_lock(void) #endif /* CONFIG_HOTPLUG_CPU */ +/* + * Architectures that need SMT-specific errata handling during SMT hotplug + * should override this. + */ +void __weak arch_smt_update(void) { } + #ifdef CONFIG_HOTPLUG_SMT enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; EXPORT_SYMBOL_GPL(cpu_smt_control); @@ -1011,6 +1018,7 @@ out: * concurrent CPU hotplug via cpu_add_remove_lock. */ lockup_detector_cleanup(); + arch_smt_update(); return ret; } @@ -1139,6 +1147,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) ret = cpuhp_up_callbacks(cpu, st, target); out: cpus_write_unlock(); + arch_smt_update(); return ret; } @@ -2055,12 +2064,6 @@ static void cpuhp_online_cpu_device(unsigned int cpu) kobject_uevent(&dev->kobj, KOBJ_ONLINE); } -/* - * Architectures that need SMT-specific errata handling during SMT hotplug - * should override this. - */ -void __weak arch_smt_update(void) { }; - static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { int cpu, ret = 0; -- cgit v1.2.3 From 46f7ecb1e7359f183f5bbd1e08b90e10e52164f9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Nov 2018 19:33:50 +0100 Subject: ptrace: Remove unused ptrace_may_access_sched() and MODE_IBRS The IBPB control code in x86 removed the usage. Remove the functionality which was introduced for this. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185005.559149393@linutronix.de --- include/linux/ptrace.h | 17 ----------------- kernel/ptrace.c | 10 ---------- 2 files changed, 27 deletions(-) (limited to 'kernel') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 6c2ffed907f5..de20ede2c5c8 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -64,15 +64,12 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); #define PTRACE_MODE_NOAUDIT 0x04 #define PTRACE_MODE_FSCREDS 0x08 #define PTRACE_MODE_REALCREDS 0x10 -#define PTRACE_MODE_SCHED 0x20 -#define PTRACE_MODE_IBPB 0x40 /* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */ #define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS) #define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS) #define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS) #define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS) -#define PTRACE_MODE_SPEC_IBPB (PTRACE_MODE_ATTACH_REALCREDS | PTRACE_MODE_IBPB) /** * ptrace_may_access - check whether the caller is permitted to access @@ -90,20 +87,6 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); */ extern bool ptrace_may_access(struct task_struct *task, unsigned int mode); -/** - * ptrace_may_access - check whether the caller is permitted to access - * a target task. - * @task: target task - * @mode: selects type of access and caller credentials - * - * Returns true on success, false on denial. - * - * Similar to ptrace_may_access(). Only to be called from context switch - * code. Does not call into audit and the regular LSM hooks due to locking - * constraints. - */ -extern bool ptrace_may_access_sched(struct task_struct *task, unsigned int mode); - static inline int ptrace_reparented(struct task_struct *child) { return !same_thread_group(child->real_parent, child->parent); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 80b34dffdfb9..c2cee9db5204 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -261,9 +261,6 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) { - if (mode & PTRACE_MODE_SCHED) - return false; - if (mode & PTRACE_MODE_NOAUDIT) return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE); else @@ -331,16 +328,9 @@ ok: !ptrace_has_cap(mm->user_ns, mode))) return -EPERM; - if (mode & PTRACE_MODE_SCHED) - return 0; return security_ptrace_access_check(task, mode); } -bool ptrace_may_access_sched(struct task_struct *task, unsigned int mode) -{ - return __ptrace_may_access(task, mode | PTRACE_MODE_SCHED); -} - bool ptrace_may_access(struct task_struct *task, unsigned int mode) { int err; -- cgit v1.2.3 From 5cf99a0f3161bc3ae2391269d134d6bf7e26f00e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 29 Nov 2018 08:50:27 -0500 Subject: tracing/fgraph: Fix set_graph_function from showing interrupts The tracefs file set_graph_function is used to only function graph functions that are listed in that file (or all functions if the file is empty). The way this is implemented is that the function graph tracer looks at every function, and if the current depth is zero and the function matches something in the file then it will trace that function. When other functions are called, the depth will be greater than zero (because the original function will be at depth zero), and all functions will be traced where the depth is greater than zero. The issue is that when a function is first entered, and the handler that checks this logic is called, the depth is set to zero. If an interrupt comes in and a function in the interrupt handler is traced, its depth will be greater than zero and it will automatically be traced, even if the original function was not. But because the logic only looks at depth it may trace interrupts when it should not be. The recent design change of the function graph tracer to fix other bugs caused the depth to be zero while the function graph callback handler is being called for a longer time, widening the race of this happening. This bug was actually there for a longer time, but because the race window was so small it seldom happened. The Fixes tag below is for the commit that widen the race window, because that commit belongs to a series that will also help fix the original bug. Cc: stable@kernel.org Fixes: 39eb456dacb5 ("function_graph: Use new curr_ret_depth to manage depth instead of curr_ret_stack") Reported-by: Joe Lawrence Tested-by: Joe Lawrence Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 57 ++++++++++++++++++++++++++++++++++-- kernel/trace/trace_functions_graph.c | 4 +++ kernel/trace/trace_irqsoff.c | 2 ++ kernel/trace/trace_sched_wakeup.c | 2 ++ 4 files changed, 62 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3b8c0e24ab30..447bd96ee658 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -512,12 +512,44 @@ enum { * can only be modified by current, we can reuse trace_recursion. */ TRACE_IRQ_BIT, + + /* Set if the function is in the set_graph_function file */ + TRACE_GRAPH_BIT, + + /* + * In the very unlikely case that an interrupt came in + * at a start of graph tracing, and we want to trace + * the function in that interrupt, the depth can be greater + * than zero, because of the preempted start of a previous + * trace. In an even more unlikely case, depth could be 2 + * if a softirq interrupted the start of graph tracing, + * followed by an interrupt preempting a start of graph + * tracing in the softirq, and depth can even be 3 + * if an NMI came in at the start of an interrupt function + * that preempted a softirq start of a function that + * preempted normal context!!!! Luckily, it can't be + * greater than 3, so the next two bits are a mask + * of what the depth is when we set TRACE_GRAPH_BIT + */ + + TRACE_GRAPH_DEPTH_START_BIT, + TRACE_GRAPH_DEPTH_END_BIT, }; #define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) #define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) #define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit))) +#define trace_recursion_depth() \ + (((current)->trace_recursion >> TRACE_GRAPH_DEPTH_START_BIT) & 3) +#define trace_recursion_set_depth(depth) \ + do { \ + current->trace_recursion &= \ + ~(3 << TRACE_GRAPH_DEPTH_START_BIT); \ + current->trace_recursion |= \ + ((depth) & 3) << TRACE_GRAPH_DEPTH_START_BIT; \ + } while (0) + #define TRACE_CONTEXT_BITS 4 #define TRACE_FTRACE_START TRACE_FTRACE_BIT @@ -843,8 +875,9 @@ extern void __trace_graph_return(struct trace_array *tr, extern struct ftrace_hash *ftrace_graph_hash; extern struct ftrace_hash *ftrace_graph_notrace_hash; -static inline int ftrace_graph_addr(unsigned long addr) +static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace) { + unsigned long addr = trace->func; int ret = 0; preempt_disable_notrace(); @@ -855,6 +888,14 @@ static inline int ftrace_graph_addr(unsigned long addr) } if (ftrace_lookup_ip(ftrace_graph_hash, addr)) { + + /* + * This needs to be cleared on the return functions + * when the depth is zero. + */ + trace_recursion_set(TRACE_GRAPH_BIT); + trace_recursion_set_depth(trace->depth); + /* * If no irqs are to be traced, but a set_graph_function * is set, and called by an interrupt handler, we still @@ -872,6 +913,13 @@ out: return ret; } +static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace) +{ + if (trace_recursion_test(TRACE_GRAPH_BIT) && + trace->depth == trace_recursion_depth()) + trace_recursion_clear(TRACE_GRAPH_BIT); +} + static inline int ftrace_graph_notrace_addr(unsigned long addr) { int ret = 0; @@ -885,7 +933,7 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr) return ret; } #else -static inline int ftrace_graph_addr(unsigned long addr) +static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace) { return 1; } @@ -894,6 +942,8 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr) { return 0; } +static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace) +{ } #endif /* CONFIG_DYNAMIC_FTRACE */ extern unsigned int fgraph_max_depth; @@ -901,7 +951,8 @@ extern unsigned int fgraph_max_depth; static inline bool ftrace_graph_ignore_func(struct ftrace_graph_ent *trace) { /* trace it when it is-nested-in or is a function enabled. */ - return !(trace->depth || ftrace_graph_addr(trace->func)) || + return !(trace_recursion_test(TRACE_GRAPH_BIT) || + ftrace_graph_addr(trace)) || (trace->depth < 0) || (fgraph_max_depth && trace->depth >= fgraph_max_depth); } diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 2561460d7baf..086af4f5c3e8 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -509,6 +509,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace) int cpu; int pc; + ftrace_graph_addr_finish(trace); + local_irq_save(flags); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->trace_buffer.data, cpu); @@ -532,6 +534,8 @@ void set_graph_array(struct trace_array *tr) static void trace_graph_thresh_return(struct ftrace_graph_ret *trace) { + ftrace_graph_addr_finish(trace); + if (tracing_thresh && (trace->rettime - trace->calltime < tracing_thresh)) return; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index b7357f9f82a3..98ea6d28df15 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -208,6 +208,8 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace) unsigned long flags; int pc; + ftrace_graph_addr_finish(trace); + if (!func_prolog_dec(tr, &data, &flags)) return; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index a86b303e6c67..7d04b9890755 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -270,6 +270,8 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace) unsigned long flags; int pc; + ftrace_graph_addr_finish(trace); + if (!func_prolog_preempt_disable(tr, &data, &pc)) return; -- cgit v1.2.3 From ef1a8409348966f0b25ff97a170d6d0367710ea9 Mon Sep 17 00:00:00 2001 From: Alexander Popov Date: Tue, 13 Nov 2018 00:08:48 +0300 Subject: stackleak: Disable function tracing and kprobes for stackleak_erase() The stackleak_erase() function is called on the trampoline stack at the end of syscall. This stack is not big enough for ftrace and kprobes operations, e.g. it can be exhausted if we use kprobe_events for stackleak_erase(). So let's disable function tracing and kprobes of stackleak_erase(). Reported-by: kernel test robot Fixes: 10e9ae9fabaf ("gcc-plugins: Add STACKLEAK plugin for tracking the kernel stack") Signed-off-by: Alexander Popov Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Masami Hiramatsu Signed-off-by: Kees Cook --- kernel/stackleak.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stackleak.c b/kernel/stackleak.c index e42892926244..08cb57eed389 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -11,6 +11,7 @@ */ #include +#include #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE #include @@ -47,7 +48,7 @@ int stack_erasing_sysctl(struct ctl_table *table, int write, #define skip_erasing() false #endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */ -asmlinkage void stackleak_erase(void) +asmlinkage void notrace stackleak_erase(void) { /* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */ unsigned long kstack_ptr = current->lowest_stack; @@ -101,6 +102,7 @@ asmlinkage void stackleak_erase(void) /* Reset the 'lowest_stack' value for the next syscall */ current->lowest_stack = current_top_of_stack() - THREAD_SIZE/64; } +NOKPROBE_SYMBOL(stackleak_erase); void __used stackleak_track_stack(void) { -- cgit v1.2.3 From e0c274472d5d27f277af722e017525e0b33784cd Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 30 Nov 2018 14:09:58 -0800 Subject: psi: make disabling/enabling easier for vendor kernels Mel Gorman reports a hackbench regression with psi that would prohibit shipping the suse kernel with it default-enabled, but he'd still like users to be able to opt in at little to no cost to others. With the current combination of CONFIG_PSI and the psi_disabled bool set from the commandline, this is a challenge. Do the following things to make it easier: 1. Add a config option CONFIG_PSI_DEFAULT_DISABLED that allows distros to enable CONFIG_PSI in their kernel but leave the feature disabled unless a user requests it at boot-time. To avoid double negatives, rename psi_disabled= to psi=. 2. Make psi_disabled a static branch to eliminate any branch costs when the feature is disabled. In terms of numbers before and after this patch, Mel says: : The following is a comparision using CONFIG_PSI=n as a baseline against : your patch and a vanilla kernel : : 4.20.0-rc4 4.20.0-rc4 4.20.0-rc4 : kconfigdisable-v1r1 vanilla psidisable-v1r1 : Amean 1 1.3100 ( 0.00%) 1.3923 ( -6.28%) 1.3427 ( -2.49%) : Amean 3 3.8860 ( 0.00%) 4.1230 * -6.10%* 3.8860 ( -0.00%) : Amean 5 6.8847 ( 0.00%) 8.0390 * -16.77%* 6.7727 ( 1.63%) : Amean 7 9.9310 ( 0.00%) 10.8367 * -9.12%* 9.9910 ( -0.60%) : Amean 12 16.6577 ( 0.00%) 18.2363 * -9.48%* 17.1083 ( -2.71%) : Amean 18 26.5133 ( 0.00%) 27.8833 * -5.17%* 25.7663 ( 2.82%) : Amean 24 34.3003 ( 0.00%) 34.6830 ( -1.12%) 32.0450 ( 6.58%) : Amean 30 40.0063 ( 0.00%) 40.5800 ( -1.43%) 41.5087 ( -3.76%) : Amean 32 40.1407 ( 0.00%) 41.2273 ( -2.71%) 39.9417 ( 0.50%) : : It's showing that the vanilla kernel takes a hit (as the bisection : indicated it would) and that disabling PSI by default is reasonably : close in terms of performance for this particular workload on this : particular machine so; Link: http://lkml.kernel.org/r/20181127165329.GA29728@cmpxchg.org Signed-off-by: Johannes Weiner Tested-by: Mel Gorman Reported-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 4 ++++ include/linux/psi.h | 3 ++- init/Kconfig | 9 ++++++++ kernel/sched/psi.c | 30 +++++++++++++++++-------- kernel/sched/stats.h | 8 +++---- 5 files changed, 40 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 675170c36078..5d6ba930d4f4 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3505,6 +3505,10 @@ before loading. See Documentation/blockdev/ramdisk.txt. + psi= [KNL] Enable or disable pressure stall information + tracking. + Format: + psmouse.proto= [HW,MOUSE] Highest PS2 mouse protocol extension to probe for; one of (bare|imps|exps|lifebook|any). psmouse.rate= [HW,MOUSE] Set desired mouse report rate, in reports diff --git a/include/linux/psi.h b/include/linux/psi.h index 8e0725aac0aa..7006008d5b72 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -1,6 +1,7 @@ #ifndef _LINUX_PSI_H #define _LINUX_PSI_H +#include #include #include @@ -9,7 +10,7 @@ struct css_set; #ifdef CONFIG_PSI -extern bool psi_disabled; +extern struct static_key_false psi_disabled; void psi_init(void); diff --git a/init/Kconfig b/init/Kconfig index a4112e95724a..cf5b5a0dcbc2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -509,6 +509,15 @@ config PSI Say N if unsure. +config PSI_DEFAULT_DISABLED + bool "Require boot parameter to enable pressure stall information tracking" + default n + depends on PSI + help + If set, pressure stall information tracking will be disabled + per default but can be enabled through passing psi_enable=1 + on the kernel commandline during boot. + endmenu # "CPU/Task time and stats accounting" config CPU_ISOLATION diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 3d7355d7c3e3..fe24de3fbc93 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -136,8 +136,18 @@ static int psi_bug __read_mostly; -bool psi_disabled __read_mostly; -core_param(psi_disabled, psi_disabled, bool, 0644); +DEFINE_STATIC_KEY_FALSE(psi_disabled); + +#ifdef CONFIG_PSI_DEFAULT_DISABLED +bool psi_enable; +#else +bool psi_enable = true; +#endif +static int __init setup_psi(char *str) +{ + return kstrtobool(str, &psi_enable) == 0; +} +__setup("psi=", setup_psi); /* Running averages - we need to be higher-res than loadavg */ #define PSI_FREQ (2*HZ+1) /* 2 sec intervals */ @@ -169,8 +179,10 @@ static void group_init(struct psi_group *group) void __init psi_init(void) { - if (psi_disabled) + if (!psi_enable) { + static_branch_enable(&psi_disabled); return; + } psi_period = jiffies_to_nsecs(PSI_FREQ); group_init(&psi_system); @@ -549,7 +561,7 @@ void psi_memstall_enter(unsigned long *flags) struct rq_flags rf; struct rq *rq; - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; *flags = current->flags & PF_MEMSTALL; @@ -579,7 +591,7 @@ void psi_memstall_leave(unsigned long *flags) struct rq_flags rf; struct rq *rq; - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; if (*flags) @@ -600,7 +612,7 @@ void psi_memstall_leave(unsigned long *flags) #ifdef CONFIG_CGROUPS int psi_cgroup_alloc(struct cgroup *cgroup) { - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return 0; cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu); @@ -612,7 +624,7 @@ int psi_cgroup_alloc(struct cgroup *cgroup) void psi_cgroup_free(struct cgroup *cgroup) { - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; cancel_delayed_work_sync(&cgroup->psi.clock_work); @@ -637,7 +649,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) struct rq_flags rf; struct rq *rq; - if (psi_disabled) { + if (static_branch_likely(&psi_disabled)) { /* * Lame to do this here, but the scheduler cannot be locked * from the outside, so we move cgroups from inside sched/. @@ -673,7 +685,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { int full; - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; update_stats(group); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 4904c4677000..aa0de240fb41 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -66,7 +66,7 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup) { int clear = 0, set = TSK_RUNNING; - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; if (!wakeup || p->sched_psi_wake_requeue) { @@ -86,7 +86,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep) { int clear = TSK_RUNNING, set = 0; - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; if (!sleep) { @@ -102,7 +102,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep) static inline void psi_ttwu_dequeue(struct task_struct *p) { - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; /* * Is the task being migrated during a wakeup? Make sure to @@ -128,7 +128,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) static inline void psi_task_tick(struct rq *rq) { - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; if (unlikely(rq->curr->flags & PF_MEMSTALL)) -- cgit v1.2.3 From 903e8ff86753e6f327bb92166a0665e4ecb8e2e7 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Fri, 30 Nov 2018 14:10:05 -0800 Subject: kernel/kcov.c: mark funcs in __sanitizer_cov_trace_pc() as notrace Since __sanitizer_cov_trace_pc() is marked as notrace, function calls in __sanitizer_cov_trace_pc() shouldn't be traced either. ftrace_graph_caller() gets called for each function that isn't marked 'notrace', like canonicalize_ip(). This is the call trace from a run: [ 139.644550] ftrace_graph_caller+0x1c/0x24 [ 139.648352] canonicalize_ip+0x18/0x28 [ 139.652313] __sanitizer_cov_trace_pc+0x14/0x58 [ 139.656184] sched_clock+0x34/0x1e8 [ 139.659759] trace_clock_local+0x40/0x88 [ 139.663722] ftrace_push_return_trace+0x8c/0x1f0 [ 139.667767] prepare_ftrace_return+0xa8/0x100 [ 139.671709] ftrace_graph_caller+0x1c/0x24 Rework so that check_kcov_mode() and canonicalize_ip() that are called from __sanitizer_cov_trace_pc() are also marked as notrace. Link: http://lkml.kernel.org/r/20181128081239.18317-1-anders.roxell@linaro.org Signed-off-by: Arnd Bergmann Signen-off-by: Anders Roxell Co-developed-by: Arnd Bergmann Acked-by: Steven Rostedt (VMware) Cc: Dmitry Vyukov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 3ebd09efe72a..97959d7b77e2 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -56,7 +56,7 @@ struct kcov { struct task_struct *t; }; -static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) +static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) { unsigned int mode; @@ -78,7 +78,7 @@ static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) return mode == needed_mode; } -static unsigned long canonicalize_ip(unsigned long ip) +static notrace unsigned long canonicalize_ip(unsigned long ip) { #ifdef CONFIG_RANDOMIZE_BASE ip -= kaslr_offset(); -- cgit v1.2.3