summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.preempt25
-rw-r--r--kernel/bpf/btf.c19
-rw-r--r--kernel/bpf/core.c5
-rw-r--r--kernel/bpf/verifier.c13
-rw-r--r--kernel/cgroup/cgroup.c50
-rw-r--r--kernel/cgroup/cpuset.c63
-rw-r--r--kernel/dma/Kconfig3
-rw-r--r--kernel/dma/direct.c44
-rw-r--r--kernel/dma/swiotlb.c30
-rw-r--r--kernel/events/core.c87
-rw-r--r--kernel/fork.c18
-rw-r--r--kernel/kprobes.c3
-rw-r--r--kernel/locking/lockdep_proc.c5
-rw-r--r--kernel/locking/mutex.c2
-rw-r--r--kernel/locking/rtmutex.c2
-rw-r--r--kernel/memremap.c245
-rw-r--r--kernel/module.c60
-rw-r--r--kernel/padata.c12
-rw-r--r--kernel/panic.c2
-rw-r--r--kernel/pid.c9
-rw-r--r--kernel/pid_namespace.c3
-rw-r--r--kernel/power/Kconfig6
-rw-r--r--kernel/ptrace.c101
-rw-r--r--kernel/resource.c88
-rw-r--r--kernel/sched/core.c4
-rw-r--r--kernel/signal.c69
-rw-r--r--kernel/smp.c16
-rw-r--r--kernel/stacktrace.c5
-rw-r--r--kernel/sysctl.c215
-rw-r--r--kernel/trace/Kconfig12
-rw-r--r--kernel/trace/ftrace.c48
-rw-r--r--kernel/trace/ring_buffer.c17
-rw-r--r--kernel/trace/trace.c17
-rw-r--r--kernel/trace/trace_event_perf.c3
-rw-r--r--kernel/trace/trace_events.c10
-rw-r--r--kernel/trace/trace_events_filter.c3
-rw-r--r--kernel/trace/trace_kprobe.c357
-rw-r--r--kernel/trace/trace_output.c9
-rw-r--r--kernel/trace/trace_probe.c142
-rw-r--r--kernel/trace/trace_probe.h77
-rw-r--r--kernel/trace/trace_probe_tmpl.h36
-rw-r--r--kernel/trace/trace_uprobe.c180
-rw-r--r--kernel/tracepoint.c4
-rw-r--r--kernel/ucount.c6
44 files changed, 1288 insertions, 837 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index dc0b682ec2d9..fc020c09b7e8 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -35,10 +35,10 @@ config PREEMPT_VOLUNTARY
Select this if you are building a kernel for a desktop system.
-config PREEMPT
+config PREEMPT_LL
bool "Preemptible Kernel (Low-Latency Desktop)"
depends on !ARCH_NO_PREEMPT
- select PREEMPT_COUNT
+ select PREEMPT
select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
help
This option reduces the latency of the kernel by making
@@ -55,7 +55,28 @@ config PREEMPT
embedded system with latency requirements in the milliseconds
range.
+config PREEMPT_RT
+ bool "Fully Preemptible Kernel (Real-Time)"
+ depends on EXPERT && ARCH_SUPPORTS_RT
+ select PREEMPT
+ help
+ This option turns the kernel into a real-time kernel by replacing
+ various locking primitives (spinlocks, rwlocks, etc.) with
+ preemptible priority-inheritance aware variants, enforcing
+ interrupt threading and introducing mechanisms to break up long
+ non-preemptible sections. This makes the kernel, except for very
+ low level and critical code pathes (entry code, scheduler, low
+ level interrupt handling) fully preemptible and brings most
+ execution contexts under scheduler control.
+
+ Select this if you are building a kernel for systems which
+ require real-time guarantees.
+
endchoice
config PREEMPT_COUNT
bool
+
+config PREEMPT
+ bool
+ select PREEMPT_COUNT
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 546ebee39e2a..5fcc7a17eb5a 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1073,11 +1073,18 @@ const struct btf_type *btf_type_id_size(const struct btf *btf,
!btf_type_is_var(size_type)))
return NULL;
- size = btf->resolved_sizes[size_type_id];
size_type_id = btf->resolved_ids[size_type_id];
size_type = btf_type_by_id(btf, size_type_id);
if (btf_type_nosize_or_null(size_type))
return NULL;
+ else if (btf_type_has_size(size_type))
+ size = size_type->size;
+ else if (btf_type_is_array(size_type))
+ size = btf->resolved_sizes[size_type_id];
+ else if (btf_type_is_ptr(size_type))
+ size = sizeof(void *);
+ else
+ return NULL;
}
*type_id = size_type_id;
@@ -1602,7 +1609,6 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
const struct btf_type *next_type;
u32 next_type_id = t->type;
struct btf *btf = env->btf;
- u32 next_type_size = 0;
next_type = btf_type_by_id(btf, next_type_id);
if (!next_type || btf_type_is_resolve_source_only(next_type)) {
@@ -1620,7 +1626,7 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
* save us a few type-following when we use it later (e.g. in
* pretty print).
*/
- if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) {
+ if (!btf_type_id_size(btf, &next_type_id, NULL)) {
if (env_type_is_resolved(env, next_type_id))
next_type = btf_type_id_resolve(btf, &next_type_id);
@@ -1633,7 +1639,7 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
}
}
- env_stack_pop_resolved(env, next_type_id, next_type_size);
+ env_stack_pop_resolved(env, next_type_id, 0);
return 0;
}
@@ -1645,7 +1651,6 @@ static int btf_var_resolve(struct btf_verifier_env *env,
const struct btf_type *t = v->t;
u32 next_type_id = t->type;
struct btf *btf = env->btf;
- u32 next_type_size;
next_type = btf_type_by_id(btf, next_type_id);
if (!next_type || btf_type_is_resolve_source_only(next_type)) {
@@ -1675,12 +1680,12 @@ static int btf_var_resolve(struct btf_verifier_env *env,
* forward types or similar that would resolve to size of
* zero is allowed.
*/
- if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) {
+ if (!btf_type_id_size(btf, &next_type_id, NULL)) {
btf_verifier_log_type(env, v->t, "Invalid type_id");
return -EINVAL;
}
- env_stack_pop_resolved(env, next_type_id, next_type_size);
+ env_stack_pop_resolved(env, next_type_id, 0);
return 0;
}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 16079550db6d..8191a7db2777 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1295,11 +1295,11 @@ bool bpf_opcode_in_insntable(u8 code)
*
* Decode and execute eBPF instructions.
*/
-static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
+static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
{
#define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y
#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
- static const void *jumptable[256] = {
+ static const void * const jumptable[256] __annotate_jump_table = {
[0 ... 255] = &&default_label,
/* Now overwrite non-defaults ... */
BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL),
@@ -1558,7 +1558,6 @@ out:
BUG_ON(1);
return 0;
}
-STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */
#define PROG_NAME(stack_size) __bpf_prog_run##stack_size
#define DEFINE_BPF_PROG_RUN(stack_size) \
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a2e763703c30..5900cbb966b1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1519,9 +1519,9 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
return -EFAULT;
}
*stack_mask |= 1ull << spi;
- } else if (class == BPF_STX) {
+ } else if (class == BPF_STX || class == BPF_ST) {
if (*reg_mask & dreg)
- /* stx shouldn't be using _scalar_ dst_reg
+ /* stx & st shouldn't be using _scalar_ dst_reg
* to access memory. It means backtracking
* encountered a case of pointer subtraction.
*/
@@ -1540,7 +1540,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
if (!(*stack_mask & (1ull << spi)))
return 0;
*stack_mask &= ~(1ull << spi);
- *reg_mask |= sreg;
+ if (class == BPF_STX)
+ *reg_mask |= sreg;
} else if (class == BPF_JMP || class == BPF_JMP32) {
if (opcode == BPF_CALL) {
if (insn->src_reg == BPF_PSEUDO_CALL)
@@ -1569,10 +1570,6 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
if (mode == BPF_IND || mode == BPF_ABS)
/* to be analyzed */
return -ENOTSUPP;
- } else if (class == BPF_ST) {
- if (*reg_mask & dreg)
- /* likely pointer subtraction */
- return -ENOTSUPP;
}
return 0;
}
@@ -6106,11 +6103,13 @@ static int check_return_code(struct bpf_verifier_env *env)
if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG)
range = tnum_range(1, 1);
+ break;
case BPF_PROG_TYPE_CGROUP_SKB:
if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) {
range = tnum_range(0, 3);
enforce_attach_type_range = tnum_range(2, 3);
}
+ break;
case BPF_PROG_TYPE_CGROUP_SOCK:
case BPF_PROG_TYPE_SOCK_OPS:
case BPF_PROG_TYPE_CGROUP_DEVICE:
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 300b0c416341..753afbca549f 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2201,8 +2201,7 @@ static int cgroup_init_fs_context(struct fs_context *fc)
fc->ops = &cgroup_fs_context_ops;
else
fc->ops = &cgroup1_fs_context_ops;
- if (fc->user_ns)
- put_user_ns(fc->user_ns);
+ put_user_ns(fc->user_ns);
fc->user_ns = get_user_ns(ctx->ns->user_ns);
fc->global = true;
return 0;
@@ -2243,6 +2242,50 @@ static struct file_system_type cgroup2_fs_type = {
.fs_flags = FS_USERNS_MOUNT,
};
+#ifdef CONFIG_CPUSETS
+static const struct fs_context_operations cpuset_fs_context_ops = {
+ .get_tree = cgroup1_get_tree,
+ .free = cgroup_fs_context_free,
+};
+
+/*
+ * This is ugly, but preserves the userspace API for existing cpuset
+ * users. If someone tries to mount the "cpuset" filesystem, we
+ * silently switch it to mount "cgroup" instead
+ */
+static int cpuset_init_fs_context(struct fs_context *fc)
+{
+ char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER);
+ struct cgroup_fs_context *ctx;
+ int err;
+
+ err = cgroup_init_fs_context(fc);
+ if (err) {
+ kfree(agent);
+ return err;
+ }
+
+ fc->ops = &cpuset_fs_context_ops;
+
+ ctx = cgroup_fc2context(fc);
+ ctx->subsys_mask = 1 << cpuset_cgrp_id;
+ ctx->flags |= CGRP_ROOT_NOPREFIX;
+ ctx->release_agent = agent;
+
+ get_filesystem(&cgroup_fs_type);
+ put_filesystem(fc->fs_type);
+ fc->fs_type = &cgroup_fs_type;
+
+ return 0;
+}
+
+static struct file_system_type cpuset_fs_type = {
+ .name = "cpuset",
+ .init_fs_context = cpuset_init_fs_context,
+ .fs_flags = FS_USERNS_MOUNT,
+};
+#endif
+
int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
struct cgroup_namespace *ns)
{
@@ -5761,6 +5804,9 @@ int __init cgroup_init(void)
WARN_ON(register_filesystem(&cgroup_fs_type));
WARN_ON(register_filesystem(&cgroup2_fs_type));
WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
+#ifdef CONFIG_CPUSETS
+ WARN_ON(register_filesystem(&cpuset_fs_type));
+#endif
return 0;
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index b3b02b9c4405..5aa37531ce76 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -356,59 +356,6 @@ static inline bool is_in_v2_mode(void)
}
/*
- * This is ugly, but preserves the userspace API for existing cpuset
- * users. If someone tries to mount the "cpuset" filesystem, we
- * silently switch it to mount "cgroup" instead
- */
-static int cpuset_get_tree(struct fs_context *fc)
-{
- struct file_system_type *cgroup_fs;
- struct fs_context *new_fc;
- int ret;
-
- cgroup_fs = get_fs_type("cgroup");
- if (!cgroup_fs)
- return -ENODEV;
-
- new_fc = fs_context_for_mount(cgroup_fs, fc->sb_flags);
- if (IS_ERR(new_fc)) {
- ret = PTR_ERR(new_fc);
- } else {
- static const char agent_path[] = "/sbin/cpuset_release_agent";
- ret = vfs_parse_fs_string(new_fc, "cpuset", NULL, 0);
- if (!ret)
- ret = vfs_parse_fs_string(new_fc, "noprefix", NULL, 0);
- if (!ret)
- ret = vfs_parse_fs_string(new_fc, "release_agent",
- agent_path, sizeof(agent_path) - 1);
- if (!ret)
- ret = vfs_get_tree(new_fc);
- if (!ret) { /* steal the result */
- fc->root = new_fc->root;
- new_fc->root = NULL;
- }
- put_fs_context(new_fc);
- }
- put_filesystem(cgroup_fs);
- return ret;
-}
-
-static const struct fs_context_operations cpuset_fs_context_ops = {
- .get_tree = cpuset_get_tree,
-};
-
-static int cpuset_init_fs_context(struct fs_context *fc)
-{
- fc->ops = &cpuset_fs_context_ops;
- return 0;
-}
-
-static struct file_system_type cpuset_fs_type = {
- .name = "cpuset",
- .init_fs_context = cpuset_init_fs_context,
-};
-
-/*
* Return in pmask the portion of a cpusets's cpus_allowed that
* are online. If none are online, walk up the cpuset hierarchy
* until we find one that does have some online cpus.
@@ -729,7 +676,7 @@ static inline int nr_cpusets(void)
* load balancing domains (sched domains) as specified by that partial
* partition.
*
- * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.rst
+ * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst
* for a background explanation of this.
*
* Does not return errors, on the theory that the callers of this
@@ -2853,13 +2800,11 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
/**
* cpuset_init - initialize cpusets at system boot
*
- * Description: Initialize top_cpuset and the cpuset internal file system,
+ * Description: Initialize top_cpuset
**/
int __init cpuset_init(void)
{
- int err = 0;
-
BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
@@ -2873,10 +2818,6 @@ int __init cpuset_init(void)
set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
top_cpuset.relax_domain_level = -1;
- err = register_filesystem(&cpuset_fs_type);
- if (err < 0)
- return err;
-
BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
return 0;
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 70f8f8d9200e..9decbba255fc 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -48,6 +48,9 @@ config ARCH_HAS_DMA_COHERENT_TO_PFN
config ARCH_HAS_DMA_MMAP_PGPROT
bool
+config ARCH_HAS_FORCE_DMA_UNENCRYPTED
+ bool
+
config DMA_NONCOHERENT_CACHE_SYNC
bool
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index b90e1aede743..59bdceea3737 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -23,14 +23,6 @@
#define ARCH_ZONE_DMA_BITS 24
#endif
-/*
- * For AMD SEV all DMA must be to unencrypted addresses.
- */
-static inline bool force_dma_unencrypted(void)
-{
- return sev_active();
-}
-
static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size)
{
if (!dev->dma_mask) {
@@ -46,7 +38,7 @@ static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size)
static inline dma_addr_t phys_to_dma_direct(struct device *dev,
phys_addr_t phys)
{
- if (force_dma_unencrypted())
+ if (force_dma_unencrypted(dev))
return __phys_to_dma(dev, phys);
return phys_to_dma(dev, phys);
}
@@ -67,7 +59,7 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
if (dev->bus_dma_mask && dev->bus_dma_mask < dma_mask)
dma_mask = dev->bus_dma_mask;
- if (force_dma_unencrypted())
+ if (force_dma_unencrypted(dev))
*phys_mask = __dma_to_phys(dev, dma_mask);
else
*phys_mask = dma_to_phys(dev, dma_mask);
@@ -159,7 +151,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
}
ret = page_address(page);
- if (force_dma_unencrypted()) {
+ if (force_dma_unencrypted(dev)) {
set_memory_decrypted((unsigned long)ret, 1 << get_order(size));
*dma_handle = __phys_to_dma(dev, page_to_phys(page));
} else {
@@ -192,7 +184,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
return;
}
- if (force_dma_unencrypted())
+ if (force_dma_unencrypted(dev))
set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
@@ -242,12 +234,14 @@ void dma_direct_sync_sg_for_device(struct device *dev,
int i;
for_each_sg(sgl, sg, nents, i) {
- if (unlikely(is_swiotlb_buffer(sg_phys(sg))))
- swiotlb_tbl_sync_single(dev, sg_phys(sg), sg->length,
+ phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
+
+ if (unlikely(is_swiotlb_buffer(paddr)))
+ swiotlb_tbl_sync_single(dev, paddr, sg->length,
dir, SYNC_FOR_DEVICE);
if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_device(dev, sg_phys(sg), sg->length,
+ arch_sync_dma_for_device(dev, paddr, sg->length,
dir);
}
}
@@ -279,11 +273,13 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
int i;
for_each_sg(sgl, sg, nents, i) {
+ phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
+
if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir);
-
- if (unlikely(is_swiotlb_buffer(sg_phys(sg))))
- swiotlb_tbl_sync_single(dev, sg_phys(sg), sg->length, dir,
+ arch_sync_dma_for_cpu(dev, paddr, sg->length, dir);
+
+ if (unlikely(is_swiotlb_buffer(paddr)))
+ swiotlb_tbl_sync_single(dev, paddr, sg->length, dir,
SYNC_FOR_CPU);
}
@@ -407,11 +403,9 @@ int dma_direct_supported(struct device *dev, u64 mask)
size_t dma_direct_max_mapping_size(struct device *dev)
{
- size_t size = SIZE_MAX;
-
/* If SWIOTLB is active, use its maximum mapping size */
- if (is_swiotlb_active())
- size = swiotlb_max_mapping_size(dev);
-
- return size;
+ if (is_swiotlb_active() &&
+ (dma_addressing_limited(dev) || swiotlb_force == SWIOTLB_FORCE))
+ return swiotlb_max_mapping_size(dev);
+ return SIZE_MAX;
}
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 62fa5a82a065..9de232229063 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -129,15 +129,17 @@ setup_io_tlb_npages(char *str)
}
early_param("swiotlb", setup_io_tlb_npages);
+static bool no_iotlb_memory;
+
unsigned long swiotlb_nr_tbl(void)
{
- return io_tlb_nslabs;
+ return unlikely(no_iotlb_memory) ? 0 : io_tlb_nslabs;
}
EXPORT_SYMBOL_GPL(swiotlb_nr_tbl);
unsigned int swiotlb_max_segment(void)
{
- return max_segment;
+ return unlikely(no_iotlb_memory) ? 0 : max_segment;
}
EXPORT_SYMBOL_GPL(swiotlb_max_segment);
@@ -160,8 +162,6 @@ unsigned long swiotlb_size_or_default(void)
return size ? size : (IO_TLB_DEFAULT_SIZE);
}
-static bool no_iotlb_memory;
-
void swiotlb_print_info(void)
{
unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT;
@@ -317,6 +317,14 @@ swiotlb_late_init_with_default_size(size_t default_size)
return rc;
}
+static void swiotlb_cleanup(void)
+{
+ io_tlb_end = 0;
+ io_tlb_start = 0;
+ io_tlb_nslabs = 0;
+ max_segment = 0;
+}
+
int
swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
{
@@ -367,10 +375,7 @@ cleanup4:
sizeof(int)));
io_tlb_list = NULL;
cleanup3:
- io_tlb_end = 0;
- io_tlb_start = 0;
- io_tlb_nslabs = 0;
- max_segment = 0;
+ swiotlb_cleanup();
return -ENOMEM;
}
@@ -394,10 +399,7 @@ void __init swiotlb_exit(void)
memblock_free_late(io_tlb_start,
PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
}
- io_tlb_start = 0;
- io_tlb_end = 0;
- io_tlb_nslabs = 0;
- max_segment = 0;
+ swiotlb_cleanup();
}
/*
@@ -546,7 +548,7 @@ not_found:
if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit())
dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
size, io_tlb_nslabs, tmp_io_tlb_used);
- return DMA_MAPPING_ERROR;
+ return (phys_addr_t)DMA_MAPPING_ERROR;
found:
io_tlb_used += nslots;
spin_unlock_irqrestore(&io_tlb_lock, flags);
@@ -664,7 +666,7 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr,
/* Oh well, have to allocate and map a bounce buffer. */
*phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start),
*phys, size, dir, attrs);
- if (*phys == DMA_MAPPING_ERROR)
+ if (*phys == (phys_addr_t)DMA_MAPPING_ERROR)
return false;
/* Ensure that the address returned is DMA'ble */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 785d708f8553..026a14541a38 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2553,6 +2553,9 @@ unlock:
return ret;
}
+static bool exclusive_event_installable(struct perf_event *event,
+ struct perf_event_context *ctx);
+
/*
* Attach a performance event to a context.
*
@@ -2567,6 +2570,8 @@ perf_install_in_context(struct perf_event_context *ctx,
lockdep_assert_held(&ctx->mutex);
+ WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
+
if (event->cpu != -1)
event->cpu = cpu;
@@ -4360,7 +4365,7 @@ static int exclusive_event_init(struct perf_event *event)
{
struct pmu *pmu = event->pmu;
- if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+ if (!is_exclusive_pmu(pmu))
return 0;
/*
@@ -4391,7 +4396,7 @@ static void exclusive_event_destroy(struct perf_event *event)
{
struct pmu *pmu = event->pmu;
- if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+ if (!is_exclusive_pmu(pmu))
return;
/* see comment in exclusive_event_init() */
@@ -4411,14 +4416,15 @@ static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
return false;
}
-/* Called under the same ctx::mutex as perf_install_in_context() */
static bool exclusive_event_installable(struct perf_event *event,
struct perf_event_context *ctx)
{
struct perf_event *iter_event;
struct pmu *pmu = event->pmu;
- if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+ lockdep_assert_held(&ctx->mutex);
+
+ if (!is_exclusive_pmu(pmu))
return true;
list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
@@ -4465,12 +4471,20 @@ static void _free_event(struct perf_event *event)
if (event->destroy)
event->destroy(event);
- if (event->ctx)
- put_ctx(event->ctx);
-
+ /*
+ * Must be after ->destroy(), due to uprobe_perf_close() using
+ * hw.target.
+ */
if (event->hw.target)
put_task_struct(event->hw.target);
+ /*
+ * perf_event_free_task() relies on put_ctx() being 'last', in particular
+ * all task references must be cleaned up.
+ */
+ if (event->ctx)
+ put_ctx(event->ctx);
+
exclusive_event_destroy(event);
module_put(event->pmu->module);
@@ -4650,8 +4664,17 @@ again:
mutex_unlock(&event->child_mutex);
list_for_each_entry_safe(child, tmp, &free_list, child_list) {
+ void *var = &child->ctx->refcount;
+
list_del(&child->child_list);
free_event(child);
+
+ /*
+ * Wake any perf_event_free_task() waiting for this event to be
+ * freed.
+ */
+ smp_mb(); /* pairs with wait_var_event() */
+ wake_up_var(var);
}
no_ctx:
@@ -10930,11 +10953,6 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_alloc;
}
- if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
- err = -EBUSY;
- goto err_context;
- }
-
/*
* Look up the group leader (we will attach this event to it):
*/
@@ -11022,6 +11040,18 @@ SYSCALL_DEFINE5(perf_event_open,
move_group = 0;
}
}
+
+ /*
+ * Failure to create exclusive events returns -EBUSY.
+ */
+ err = -EBUSY;
+ if (!exclusive_event_installable(group_leader, ctx))
+ goto err_locked;
+
+ for_each_sibling_event(sibling, group_leader) {
+ if (!exclusive_event_installable(sibling, ctx))
+ goto err_locked;
+ }
} else {
mutex_lock(&ctx->mutex);
}
@@ -11058,9 +11088,6 @@ SYSCALL_DEFINE5(perf_event_open,
* because we need to serialize with concurrent event creation.
*/
if (!exclusive_event_installable(event, ctx)) {
- /* exclusive and group stuff are assumed mutually exclusive */
- WARN_ON_ONCE(move_group);
-
err = -EBUSY;
goto err_locked;
}
@@ -11527,11 +11554,11 @@ static void perf_free_event(struct perf_event *event,
}
/*
- * Free an unexposed, unused context as created by inheritance by
- * perf_event_init_task below, used by fork() in case of fail.
+ * Free a context as created by inheritance by perf_event_init_task() below,
+ * used by fork() in case of fail.
*
- * Not all locks are strictly required, but take them anyway to be nice and
- * help out with the lockdep assertions.
+ * Even though the task has never lived, the context and events have been
+ * exposed through the child_list, so we must take care tearing it all down.
*/
void perf_event_free_task(struct task_struct *task)
{
@@ -11561,7 +11588,23 @@ void perf_event_free_task(struct task_struct *task)
perf_free_event(event, ctx);
mutex_unlock(&ctx->mutex);
- put_ctx(ctx);
+
+ /*
+ * perf_event_release_kernel() could've stolen some of our
+ * child events and still have them on its free_list. In that
+ * case we must wait for these events to have been freed (in
+ * particular all their references to this task must've been
+ * dropped).
+ *
+ * Without this copy_process() will unconditionally free this
+ * task (irrespective of its reference count) and
+ * _free_event()'s put_task_struct(event->hw.target) will be a
+ * use-after-free.
+ *
+ * Wait for all events to drop their context reference.
+ */
+ wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
+ put_ctx(ctx); /* must be last */
}
}
@@ -11575,9 +11618,7 @@ void perf_event_delayed_put(struct task_struct *task)
struct file *perf_event_get(unsigned int fd)
{
- struct file *file;
-
- file = fget_raw(fd);
+ struct file *file = fget(fd);
if (!file)
return ERR_PTR(-EBADF);
diff --git a/kernel/fork.c b/kernel/fork.c
index 8f3e2d97d771..d8ae0f1b4148 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -677,7 +677,6 @@ void __mmdrop(struct mm_struct *mm)
WARN_ON_ONCE(mm == current->active_mm);
mm_free_pgd(mm);
destroy_context(mm);
- hmm_mm_destroy(mm);
mmu_notifier_mm_destroy(mm);
check_mm(mm);
put_user_ns(mm->user_ns);
@@ -2406,6 +2405,16 @@ long _do_fork(struct kernel_clone_args *args)
return nr;
}
+bool legacy_clone_args_valid(const struct kernel_clone_args *kargs)
+{
+ /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
+ if ((kargs->flags & CLONE_PIDFD) &&
+ (kargs->flags & CLONE_PARENT_SETTID))
+ return false;
+
+ return true;
+}
+
#ifndef CONFIG_HAVE_COPY_THREAD_TLS
/* For compatibility with architectures that call do_fork directly rather than
* using the syscall entry points below. */
@@ -2417,6 +2426,7 @@ long do_fork(unsigned long clone_flags,
{
struct kernel_clone_args args = {
.flags = (clone_flags & ~CSIGNAL),
+ .pidfd = parent_tidptr,
.child_tid = child_tidptr,
.parent_tid = parent_tidptr,
.exit_signal = (clone_flags & CSIGNAL),
@@ -2424,6 +2434,9 @@ long do_fork(unsigned long clone_flags,
.stack_size = stack_size,
};
+ if (!legacy_clone_args_valid(&args))
+ return -EINVAL;
+
return _do_fork(&args);
}
#endif
@@ -2505,8 +2518,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
.tls = tls,
};
- /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
- if ((clone_flags & CLONE_PIDFD) && (clone_flags & CLONE_PARENT_SETTID))
+ if (!legacy_clone_args_valid(&args))
return -EINVAL;
return _do_fork(&args);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9f5433a52488..9873fc627d61 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2276,6 +2276,7 @@ static int __init init_kprobes(void)
init_test_probes();
return err;
}
+subsys_initcall(init_kprobes);
#ifdef CONFIG_DEBUG_FS
static void report_probe(struct seq_file *pi, struct kprobe *p,
@@ -2588,5 +2589,3 @@ static int __init debugfs_kprobe_init(void)
late_initcall(debugfs_kprobe_init);
#endif /* CONFIG_DEBUG_FS */
-
-module_init(init_kprobes);
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 9c49ec645d8b..65b6a1600c8f 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -210,6 +210,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0,
sum_forward_deps = 0;
+#ifdef CONFIG_PROVE_LOCKING
list_for_each_entry(class, &all_lock_classes, lock_entry) {
if (class->usage_mask == 0)
@@ -241,13 +242,13 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ)
nr_hardirq_read_unsafe++;
-#ifdef CONFIG_PROVE_LOCKING
sum_forward_deps += lockdep_count_forward_deps(class);
-#endif
}
#ifdef CONFIG_DEBUG_LOCKDEP
DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused);
#endif
+
+#endif
seq_printf(m, " lock-classes: %11lu [max: %lu]\n",
nr_lock_classes, MAX_LOCKDEP_KEYS);
seq_printf(m, " direct dependencies: %11lu [max: %lu]\n",
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 0c601ae072b3..edd1c082dbf5 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -16,7 +16,7 @@
* by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale
* and Sven Dietrich.
*
- * Also see Documentation/locking/mutex-design.txt.
+ * Also see Documentation/locking/mutex-design.rst.
*/
#include <linux/mutex.h>
#include <linux/ww_mutex.h>
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 38fbf9fa7f1b..fa83d36e30c6 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -9,7 +9,7 @@
* Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
* Copyright (C) 2006 Esben Nielsen
*
- * See Documentation/locking/rt-mutex-design.txt for details.
+ * See Documentation/locking/rt-mutex-design.rst for details.
*/
#include <linux/spinlock.h>
#include <linux/export.h>
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 6e1970719dc2..6ee03a816d67 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -11,41 +11,39 @@
#include <linux/types.h>
#include <linux/wait_bit.h>
#include <linux/xarray.h>
-#include <linux/hmm.h>
static DEFINE_XARRAY(pgmap_array);
#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
-#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
-vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
- unsigned long addr,
- swp_entry_t entry,
- unsigned int flags,
- pmd_t *pmdp)
+#ifdef CONFIG_DEV_PAGEMAP_OPS
+DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
+EXPORT_SYMBOL(devmap_managed_key);
+static atomic_t devmap_managed_enable;
+
+static void devmap_managed_enable_put(void *data)
{
- struct page *page = device_private_entry_to_page(entry);
- struct hmm_devmem *devmem;
+ if (atomic_dec_and_test(&devmap_managed_enable))
+ static_branch_disable(&devmap_managed_key);
+}
- devmem = container_of(page->pgmap, typeof(*devmem), pagemap);
+static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap)
+{
+ if (!pgmap->ops || !pgmap->ops->page_free) {
+ WARN(1, "Missing page_free method\n");
+ return -EINVAL;
+ }
- /*
- * The page_fault() callback must migrate page back to system memory
- * so that CPU can access it. This might fail for various reasons
- * (device issue, device was unsafely unplugged, ...). When such
- * error conditions happen, the callback must return VM_FAULT_SIGBUS.
- *
- * Note that because memory cgroup charges are accounted to the device
- * memory, this should never fail because of memory restrictions (but
- * allocation of regular system page might still fail because we are
- * out of memory).
- *
- * There is a more in-depth description of what that callback can and
- * cannot do, in include/linux/memremap.h
- */
- return devmem->page_fault(vma, addr, page, flags, pmdp);
+ if (atomic_inc_return(&devmap_managed_enable) == 1)
+ static_branch_enable(&devmap_managed_key);
+ return devm_add_action_or_reset(dev, devmap_managed_enable_put, NULL);
}
-#endif /* CONFIG_DEVICE_PRIVATE */
+#else
+static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap)
+{
+ return -EINVAL;
+}
+#endif /* CONFIG_DEV_PAGEMAP_OPS */
static void pgmap_array_delete(struct resource *res)
{
@@ -56,14 +54,8 @@ static void pgmap_array_delete(struct resource *res)
static unsigned long pfn_first(struct dev_pagemap *pgmap)
{
- const struct resource *res = &pgmap->res;
- struct vmem_altmap *altmap = &pgmap->altmap;
- unsigned long pfn;
-
- pfn = res->start >> PAGE_SHIFT;
- if (pgmap->altmap_valid)
- pfn += vmem_altmap_offset(altmap);
- return pfn;
+ return PHYS_PFN(pgmap->res.start) +
+ vmem_altmap_offset(pgmap_altmap(pgmap));
}
static unsigned long pfn_end(struct dev_pagemap *pgmap)
@@ -83,59 +75,81 @@ static unsigned long pfn_next(unsigned long pfn)
#define for_each_device_pfn(pfn, map) \
for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn))
+static void dev_pagemap_kill(struct dev_pagemap *pgmap)
+{
+ if (pgmap->ops && pgmap->ops->kill)
+ pgmap->ops->kill(pgmap);
+ else
+ percpu_ref_kill(pgmap->ref);
+}
+
+static void dev_pagemap_cleanup(struct dev_pagemap *pgmap)
+{
+ if (pgmap->ops && pgmap->ops->cleanup) {
+ pgmap->ops->cleanup(pgmap);
+ } else {
+ wait_for_completion(&pgmap->done);
+ percpu_ref_exit(pgmap->ref);
+ }
+}
+
static void devm_memremap_pages_release(void *data)
{
struct dev_pagemap *pgmap = data;
struct device *dev = pgmap->dev;
struct resource *res = &pgmap->res;
- resource_size_t align_start, align_size;
unsigned long pfn;
int nid;
- pgmap->kill(pgmap->ref);
+ dev_pagemap_kill(pgmap);
for_each_device_pfn(pfn, pgmap)
put_page(pfn_to_page(pfn));
- pgmap->cleanup(pgmap->ref);
+ dev_pagemap_cleanup(pgmap);
/* pages are dead and unused, undo the arch mapping */
- align_start = res->start & ~(SECTION_SIZE - 1);
- align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
- - align_start;
-
- nid = page_to_nid(pfn_to_page(align_start >> PAGE_SHIFT));
+ nid = page_to_nid(pfn_to_page(PHYS_PFN(res->start)));
mem_hotplug_begin();
if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
- pfn = align_start >> PAGE_SHIFT;
+ pfn = PHYS_PFN(res->start);
__remove_pages(page_zone(pfn_to_page(pfn)), pfn,
- align_size >> PAGE_SHIFT, NULL);
+ PHYS_PFN(resource_size(res)), NULL);
} else {
- arch_remove_memory(nid, align_start, align_size,
- pgmap->altmap_valid ? &pgmap->altmap : NULL);
- kasan_remove_zero_shadow(__va(align_start), align_size);
+ arch_remove_memory(nid, res->start, resource_size(res),
+ pgmap_altmap(pgmap));
+ kasan_remove_zero_shadow(__va(res->start), resource_size(res));
}
mem_hotplug_done();
- untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
+ untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res));
pgmap_array_delete(res);
dev_WARN_ONCE(dev, pgmap->altmap.alloc,
"%s: failed to free all reserved pages\n", __func__);
}
+static void dev_pagemap_percpu_release(struct percpu_ref *ref)
+{
+ struct dev_pagemap *pgmap =
+ container_of(ref, struct dev_pagemap, internal_ref);
+
+ complete(&pgmap->done);
+}
+
/**
* devm_memremap_pages - remap and provide memmap backing for the given resource
* @dev: hosting device for @res
* @pgmap: pointer to a struct dev_pagemap
*
* Notes:
- * 1/ At a minimum the res, ref and type members of @pgmap must be initialized
+ * 1/ At a minimum the res and type members of @pgmap must be initialized
* by the caller before passing it to this function
*
- * 2/ The altmap field may optionally be initialized, in which case altmap_valid
- * must be set to true
+ * 2/ The altmap field may optionally be initialized, in which case
+ * PGMAP_ALTMAP_VALID must be set in pgmap->flags.
*
- * 3/ pgmap->ref must be 'live' on entry and will be killed and reaped
- * at devm_memremap_pages_release() time, or if this routine fails.
+ * 3/ The ref field may optionally be provided, in which pgmap->ref must be
+ * 'live' on entry and will be killed and reaped at
+ * devm_memremap_pages_release() time, or if this routine fails.
*
* 4/ res is expected to be a host memory range that could feasibly be
* treated as a "System RAM" range, i.e. not a device mmio range, but
@@ -143,31 +157,69 @@ static void devm_memremap_pages_release(void *data)
*/
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
{
- resource_size_t align_start, align_size, align_end;
- struct vmem_altmap *altmap = pgmap->altmap_valid ?
- &pgmap->altmap : NULL;
struct resource *res = &pgmap->res;
struct dev_pagemap *conflict_pgmap;
struct mhp_restrictions restrictions = {
/*
* We do not want any optional features only our own memmap
- */
- .altmap = altmap,
+ */
+ .altmap = pgmap_altmap(pgmap),
};
pgprot_t pgprot = PAGE_KERNEL;
int error, nid, is_ram;
+ bool need_devmap_managed = true;
+
+ switch (pgmap->type) {
+ case MEMORY_DEVICE_PRIVATE:
+ if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) {
+ WARN(1, "Device private memory not supported\n");
+ return ERR_PTR(-EINVAL);
+ }
+ if (!pgmap->ops || !pgmap->ops->migrate_to_ram) {
+ WARN(1, "Missing migrate_to_ram method\n");
+ return ERR_PTR(-EINVAL);
+ }
+ break;
+ case MEMORY_DEVICE_FS_DAX:
+ if (!IS_ENABLED(CONFIG_ZONE_DEVICE) ||
+ IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
+ WARN(1, "File system DAX not supported\n");
+ return ERR_PTR(-EINVAL);
+ }
+ break;
+ case MEMORY_DEVICE_DEVDAX:
+ case MEMORY_DEVICE_PCI_P2PDMA:
+ need_devmap_managed = false;
+ break;
+ default:
+ WARN(1, "Invalid pgmap type %d\n", pgmap->type);
+ break;
+ }
+
+ if (!pgmap->ref) {
+ if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup))
+ return ERR_PTR(-EINVAL);
- if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) {
- WARN(1, "Missing reference count teardown definition\n");
- return ERR_PTR(-EINVAL);
+ init_completion(&pgmap->done);
+ error = percpu_ref_init(&pgmap->internal_ref,
+ dev_pagemap_percpu_release, 0, GFP_KERNEL);
+ if (error)
+ return ERR_PTR(error);
+ pgmap->ref = &pgmap->internal_ref;
+ } else {
+ if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) {
+ WARN(1, "Missing reference count teardown definition\n");
+ return ERR_PTR(-EINVAL);
+ }
}
- align_start = res->start & ~(SECTION_SIZE - 1);
- align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
- - align_start;
- align_end = align_start + align_size - 1;
+ if (need_devmap_managed) {
+ error = devmap_managed_enable_get(dev, pgmap);
+ if (error)
+ return ERR_PTR(error);
+ }
- conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_start), NULL);
+ conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->start), NULL);
if (conflict_pgmap) {
dev_WARN(dev, "Conflicting mapping in same section\n");
put_dev_pagemap(conflict_pgmap);
@@ -175,7 +227,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
goto err_array;
}
- conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL);
+ conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->end), NULL);
if (conflict_pgmap) {
dev_WARN(dev, "Conflicting mapping in same section\n");
put_dev_pagemap(conflict_pgmap);
@@ -183,7 +235,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
goto err_array;
}
- is_ram = region_intersects(align_start, align_size,
+ is_ram = region_intersects(res->start, resource_size(res),
IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
if (is_ram != REGION_DISJOINT) {
@@ -204,8 +256,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
if (nid < 0)
nid = numa_mem_id();
- error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(align_start), 0,
- align_size);
+ error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(res->start), 0,
+ resource_size(res));
if (error)
goto err_pfn_remap;
@@ -223,16 +275,16 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
* arch_add_memory().
*/
if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
- error = add_pages(nid, align_start >> PAGE_SHIFT,
- align_size >> PAGE_SHIFT, &restrictions);
+ error = add_pages(nid, PHYS_PFN(res->start),
+ PHYS_PFN(resource_size(res)), &restrictions);
} else {
- error = kasan_add_zero_shadow(__va(align_start), align_size);
+ error = kasan_add_zero_shadow(__va(res->start), resource_size(res));
if (error) {
mem_hotplug_done();
goto err_kasan;
}
- error = arch_add_memory(nid, align_start, align_size,
+ error = arch_add_memory(nid, res->start, resource_size(res),
&restrictions);
}
@@ -240,8 +292,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
struct zone *zone;
zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
- move_pfn_range_to_zone(zone, align_start >> PAGE_SHIFT,
- align_size >> PAGE_SHIFT, altmap);
+ move_pfn_range_to_zone(zone, PHYS_PFN(res->start),
+ PHYS_PFN(resource_size(res)), restrictions.altmap);
}
mem_hotplug_done();
@@ -253,8 +305,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
* to allow us to do the work while not holding the hotplug lock.
*/
memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
- align_start >> PAGE_SHIFT,
- align_size >> PAGE_SHIFT, pgmap);
+ PHYS_PFN(res->start),
+ PHYS_PFN(resource_size(res)), pgmap);
percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap));
error = devm_add_action_or_reset(dev, devm_memremap_pages_release,
@@ -265,15 +317,14 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
return __va(res->start);
err_add_memory:
- kasan_remove_zero_shadow(__va(align_start), align_size);
+ kasan_remove_zero_shadow(__va(res->start), resource_size(res));
err_kasan:
- untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
+ untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res));
err_pfn_remap:
pgmap_array_delete(res);
err_array:
- pgmap->kill(pgmap->ref);
- pgmap->cleanup(pgmap->ref);
-
+ dev_pagemap_kill(pgmap);
+ dev_pagemap_cleanup(pgmap);
return ERR_PTR(error);
}
EXPORT_SYMBOL_GPL(devm_memremap_pages);
@@ -287,7 +338,9 @@ EXPORT_SYMBOL_GPL(devm_memunmap_pages);
unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
{
/* number of pfns from base where pfn_to_page() is valid */
- return altmap->reserve + altmap->free;
+ if (altmap)
+ return altmap->reserve + altmap->free;
+ return 0;
}
void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
@@ -329,28 +382,6 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
EXPORT_SYMBOL_GPL(get_dev_pagemap);
#ifdef CONFIG_DEV_PAGEMAP_OPS
-DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
-EXPORT_SYMBOL(devmap_managed_key);
-static atomic_t devmap_enable;
-
-/*
- * Toggle the static key for ->page_free() callbacks when dev_pagemap
- * pages go idle.
- */
-void dev_pagemap_get_ops(void)
-{
- if (atomic_inc_return(&devmap_enable) == 1)
- static_branch_enable(&devmap_managed_key);
-}
-EXPORT_SYMBOL_GPL(dev_pagemap_get_ops);
-
-void dev_pagemap_put_ops(void)
-{
- if (atomic_dec_and_test(&devmap_enable))
- static_branch_disable(&devmap_managed_key);
-}
-EXPORT_SYMBOL_GPL(dev_pagemap_put_ops);
-
void __put_devmap_managed_page(struct page *page)
{
int count = page_ref_dec_return(page);
@@ -366,7 +397,7 @@ void __put_devmap_managed_page(struct page *page)
mem_cgroup_uncharge(page);
- page->pgmap->page_free(page, page->pgmap->data);
+ page->pgmap->ops->page_free(page);
} else if (!count)
__put_page(page);
}
diff --git a/kernel/module.c b/kernel/module.c
index a2cee14a83f3..5933395af9a0 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1492,8 +1492,7 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info)
for (i = 0; i < info->hdr->e_shnum; i++)
if (!sect_empty(&info->sechdrs[i]))
nloaded++;
- size[0] = ALIGN(sizeof(*sect_attrs)
- + nloaded * sizeof(sect_attrs->attrs[0]),
+ size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded),
sizeof(sect_attrs->grp.attrs[0]));
size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]);
sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
@@ -1697,6 +1696,8 @@ static int add_usage_links(struct module *mod)
return ret;
}
+static void module_remove_modinfo_attrs(struct module *mod, int end);
+
static int module_add_modinfo_attrs(struct module *mod)
{
struct module_attribute *attr;
@@ -1711,24 +1712,34 @@ static int module_add_modinfo_attrs(struct module *mod)
return -ENOMEM;
temp_attr = mod->modinfo_attrs;
- for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) {
+ for (i = 0; (attr = modinfo_attrs[i]); i++) {
if (!attr->test || attr->test(mod)) {
memcpy(temp_attr, attr, sizeof(*temp_attr));
sysfs_attr_init(&temp_attr->attr);
error = sysfs_create_file(&mod->mkobj.kobj,
&temp_attr->attr);
+ if (error)
+ goto error_out;
++temp_attr;
}
}
+
+ return 0;
+
+error_out:
+ if (i > 0)
+ module_remove_modinfo_attrs(mod, --i);
return error;
}
-static void module_remove_modinfo_attrs(struct module *mod)
+static void module_remove_modinfo_attrs(struct module *mod, int end)
{
struct module_attribute *attr;
int i;
for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) {
+ if (end >= 0 && i > end)
+ break;
/* pick a field to test for end of list */
if (!attr->attr.name)
break;
@@ -1816,7 +1827,7 @@ static int mod_sysfs_setup(struct module *mod,
return 0;
out_unreg_modinfo_attrs:
- module_remove_modinfo_attrs(mod);
+ module_remove_modinfo_attrs(mod, -1);
out_unreg_param:
module_param_sysfs_remove(mod);
out_unreg_holders:
@@ -1852,7 +1863,7 @@ static void mod_sysfs_fini(struct module *mod)
{
}
-static void module_remove_modinfo_attrs(struct module *mod)
+static void module_remove_modinfo_attrs(struct module *mod, int end)
{
}
@@ -1868,14 +1879,14 @@ static void init_param_lock(struct module *mod)
static void mod_sysfs_teardown(struct module *mod)
{
del_usage_links(mod);
- module_remove_modinfo_attrs(mod);
+ module_remove_modinfo_attrs(mod, -1);
module_param_sysfs_remove(mod);
kobject_put(mod->mkobj.drivers_dir);
kobject_put(mod->holders_dir);
mod_sysfs_fini(mod);
}
-#ifdef CONFIG_STRICT_MODULE_RWX
+#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
/*
* LKM RO/NX protection: protect module's text/ro-data
* from modification and any data from execution.
@@ -1898,6 +1909,7 @@ static void frob_text(const struct module_layout *layout,
layout->text_size >> PAGE_SHIFT);
}
+#ifdef CONFIG_STRICT_MODULE_RWX
static void frob_rodata(const struct module_layout *layout,
int (*set_memory)(unsigned long start, int num_pages))
{
@@ -1949,13 +1961,9 @@ void module_enable_ro(const struct module *mod, bool after_init)
set_vm_flush_reset_perms(mod->core_layout.base);
set_vm_flush_reset_perms(mod->init_layout.base);
frob_text(&mod->core_layout, set_memory_ro);
- frob_text(&mod->core_layout, set_memory_x);
frob_rodata(&mod->core_layout, set_memory_ro);
-
frob_text(&mod->init_layout, set_memory_ro);
- frob_text(&mod->init_layout, set_memory_x);
-
frob_rodata(&mod->init_layout, set_memory_ro);
if (after_init)
@@ -2014,9 +2022,19 @@ void set_all_modules_text_ro(void)
}
mutex_unlock(&module_mutex);
}
-#else
+#else /* !CONFIG_STRICT_MODULE_RWX */
static void module_enable_nx(const struct module *mod) { }
-#endif
+#endif /* CONFIG_STRICT_MODULE_RWX */
+static void module_enable_x(const struct module *mod)
+{
+ frob_text(&mod->core_layout, set_memory_x);
+ frob_text(&mod->init_layout, set_memory_x);
+}
+#else /* !CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
+static void module_enable_nx(const struct module *mod) { }
+static void module_enable_x(const struct module *mod) { }
+#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
+
#ifdef CONFIG_LIVEPATCH
/*
@@ -2723,6 +2741,11 @@ void * __weak module_alloc(unsigned long size)
return vmalloc_exec(size);
}
+bool __weak module_exit_section(const char *name)
+{
+ return strstarts(name, ".exit");
+}
+
#ifdef CONFIG_DEBUG_KMEMLEAK
static void kmemleak_load_module(const struct module *mod,
const struct load_info *info)
@@ -2912,7 +2935,7 @@ static int rewrite_section_headers(struct load_info *info, int flags)
#ifndef CONFIG_MODULE_UNLOAD
/* Don't load .exit sections */
- if (strstarts(info->secstrings+shdr->sh_name, ".exit"))
+ if (module_exit_section(info->secstrings+shdr->sh_name))
shdr->sh_flags &= ~(unsigned long)SHF_ALLOC;
#endif
}
@@ -3390,8 +3413,7 @@ static bool finished_loading(const char *name)
sched_annotate_sleep();
mutex_lock(&module_mutex);
mod = find_module_all(name, strlen(name), true);
- ret = !mod || mod->state == MODULE_STATE_LIVE
- || mod->state == MODULE_STATE_GOING;
+ ret = !mod || mod->state == MODULE_STATE_LIVE;
mutex_unlock(&module_mutex);
return ret;
@@ -3581,8 +3603,7 @@ again:
mutex_lock(&module_mutex);
old = find_module_all(mod->name, strlen(mod->name), true);
if (old != NULL) {
- if (old->state == MODULE_STATE_COMING
- || old->state == MODULE_STATE_UNFORMED) {
+ if (old->state != MODULE_STATE_LIVE) {
/* Wait in case it fails to load. */
mutex_unlock(&module_mutex);
err = wait_event_interruptible(module_wq,
@@ -3621,6 +3642,7 @@ static int complete_formation(struct module *mod, struct load_info *info)
module_enable_ro(mod, false);
module_enable_nx(mod);
+ module_enable_x(mod);
/* Mark state as coming so strong_try_module_get() ignores us,
* but kallsyms etc. can see us. */
diff --git a/kernel/padata.c b/kernel/padata.c
index 2d2fddbb7a4c..15a8ad63f4ff 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -267,7 +267,12 @@ static void padata_reorder(struct parallel_data *pd)
* The next object that needs serialization might have arrived to
* the reorder queues in the meantime, we will be called again
* from the timer function if no one else cares for it.
+ *
+ * Ensure reorder_objects is read after pd->lock is dropped so we see
+ * an increment from another task in padata_do_serial. Pairs with
+ * smp_mb__after_atomic in padata_do_serial.
*/
+ smp_mb();
if (atomic_read(&pd->reorder_objects)
&& !(pinst->flags & PADATA_RESET))
mod_timer(&pd->timer, jiffies + HZ);
@@ -387,6 +392,13 @@ void padata_do_serial(struct padata_priv *padata)
list_add_tail(&padata->list, &pqueue->reorder.list);
spin_unlock(&pqueue->reorder.lock);
+ /*
+ * Ensure the atomic_inc of reorder_objects above is ordered correctly
+ * with the trylock of pd->lock in padata_reorder. Pairs with smp_mb
+ * in padata_reorder.
+ */
+ smp_mb__after_atomic();
+
put_cpu();
/* If we're running on the wrong CPU, call padata_reorder() via a
diff --git a/kernel/panic.c b/kernel/panic.c
index 4d9f55bf7d38..057540b6eee9 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -372,7 +372,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
/**
* print_tainted - return a string to represent the kernel taint state.
*
- * For individual taint flag meanings, see Documentation/sysctl/kernel.txt
+ * For individual taint flag meanings, see Documentation/admin-guide/sysctl/kernel.rst
*
* The string is overwritten by the next call to print_tainted(),
* but is always NULL terminated.
diff --git a/kernel/pid.c b/kernel/pid.c
index 16263b526560..0a9f2e437217 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -37,14 +37,14 @@
#include <linux/init_task.h>
#include <linux/syscalls.h>
#include <linux/proc_ns.h>
-#include <linux/proc_fs.h>
+#include <linux/refcount.h>
#include <linux/anon_inodes.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/idr.h>
struct pid init_struct_pid = {
- .count = ATOMIC_INIT(1),
+ .count = REFCOUNT_INIT(1),
.tasks = {
{ .first = NULL },
{ .first = NULL },
@@ -108,8 +108,7 @@ void put_pid(struct pid *pid)
return;
ns = pid->numbers[pid->level].ns;
- if ((atomic_read(&pid->count) == 1) ||
- atomic_dec_and_test(&pid->count)) {
+ if (refcount_dec_and_test(&pid->count)) {
kmem_cache_free(ns->pid_cachep, pid);
put_pid_ns(ns);
}
@@ -212,7 +211,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
}
get_pid_ns(ns);
- atomic_set(&pid->count, 1);
+ refcount_set(&pid->count, 1);
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(&pid->tasks[type]);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 6d726cef241c..a6a79f85c81a 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -291,14 +291,13 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
}
extern int pid_max;
-static int zero = 0;
static struct ctl_table pid_ns_ctl_table[] = {
{
.procname = "ns_last_pid",
.maxlen = sizeof(int),
.mode = 0666, /* permissions are checked in the handler */
.proc_handler = pid_ns_ctl_handler,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &pid_max,
},
{ }
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ff8592ddedee..d3667b4075c1 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -66,7 +66,7 @@ config HIBERNATION
need to run mkswap against the swap partition used for the suspend.
It also works with swap files to a limited extent (for details see
- <file:Documentation/power/swsusp-and-swap-files.txt>).
+ <file:Documentation/power/swsusp-and-swap-files.rst>).
Right now you may boot without resuming and resume later but in the
meantime you cannot use the swap partition(s)/file(s) involved in
@@ -75,7 +75,7 @@ config HIBERNATION
MOUNT any journaled filesystems mounted before the suspend or they
will get corrupted in a nasty way.
- For more information take a look at <file:Documentation/power/swsusp.txt>.
+ For more information take a look at <file:Documentation/power/swsusp.rst>.
config ARCH_SAVE_PAGE_KEYS
bool
@@ -256,7 +256,7 @@ config APM_EMULATION
notification of APM "events" (e.g. battery status change).
In order to use APM, you will need supporting software. For location
- and more information, read <file:Documentation/power/apm-acpi.txt>
+ and more information, read <file:Documentation/power/apm-acpi.rst>
and the Battery Powered Linux mini-HOWTO, available from
<http://www.tldp.org/docs.html#howto>.
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 83a531cea2f3..cb9ddcc08119 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -32,6 +32,8 @@
#include <linux/compat.h>
#include <linux/sched/signal.h>
+#include <asm/syscall.h> /* for syscall_get_* */
+
/*
* Access another process' address space via ptrace.
* Source/target buffer must be kernel space,
@@ -897,7 +899,100 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
* to ensure no machine forgets it.
*/
EXPORT_SYMBOL_GPL(task_user_regset_view);
-#endif
+
+static unsigned long
+ptrace_get_syscall_info_entry(struct task_struct *child, struct pt_regs *regs,
+ struct ptrace_syscall_info *info)
+{
+ unsigned long args[ARRAY_SIZE(info->entry.args)];
+ int i;
+
+ info->op = PTRACE_SYSCALL_INFO_ENTRY;
+ info->entry.nr = syscall_get_nr(child, regs);
+ syscall_get_arguments(child, regs, args);
+ for (i = 0; i < ARRAY_SIZE(args); i++)
+ info->entry.args[i] = args[i];
+
+ /* args is the last field in struct ptrace_syscall_info.entry */
+ return offsetofend(struct ptrace_syscall_info, entry.args);
+}
+
+static unsigned long
+ptrace_get_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs,
+ struct ptrace_syscall_info *info)
+{
+ /*
+ * As struct ptrace_syscall_info.entry is currently a subset
+ * of struct ptrace_syscall_info.seccomp, it makes sense to
+ * initialize that subset using ptrace_get_syscall_info_entry().
+ * This can be reconsidered in the future if these structures
+ * diverge significantly enough.
+ */
+ ptrace_get_syscall_info_entry(child, regs, info);
+ info->op = PTRACE_SYSCALL_INFO_SECCOMP;
+ info->seccomp.ret_data = child->ptrace_message;
+
+ /* ret_data is the last field in struct ptrace_syscall_info.seccomp */
+ return offsetofend(struct ptrace_syscall_info, seccomp.ret_data);
+}
+
+static unsigned long
+ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs,
+ struct ptrace_syscall_info *info)
+{
+ info->op = PTRACE_SYSCALL_INFO_EXIT;
+ info->exit.rval = syscall_get_error(child, regs);
+ info->exit.is_error = !!info->exit.rval;
+ if (!info->exit.is_error)
+ info->exit.rval = syscall_get_return_value(child, regs);
+
+ /* is_error is the last field in struct ptrace_syscall_info.exit */
+ return offsetofend(struct ptrace_syscall_info, exit.is_error);
+}
+
+static int
+ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size,
+ void __user *datavp)
+{
+ struct pt_regs *regs = task_pt_regs(child);
+ struct ptrace_syscall_info info = {
+ .op = PTRACE_SYSCALL_INFO_NONE,
+ .arch = syscall_get_arch(child),
+ .instruction_pointer = instruction_pointer(regs),
+ .stack_pointer = user_stack_pointer(regs),
+ };
+ unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry);
+ unsigned long write_size;
+
+ /*
+ * This does not need lock_task_sighand() to access
+ * child->last_siginfo because ptrace_freeze_traced()
+ * called earlier by ptrace_check_attach() ensures that
+ * the tracee cannot go away and clear its last_siginfo.
+ */
+ switch (child->last_siginfo ? child->last_siginfo->si_code : 0) {
+ case SIGTRAP | 0x80:
+ switch (child->ptrace_message) {
+ case PTRACE_EVENTMSG_SYSCALL_ENTRY:
+ actual_size = ptrace_get_syscall_info_entry(child, regs,
+ &info);
+ break;
+ case PTRACE_EVENTMSG_SYSCALL_EXIT:
+ actual_size = ptrace_get_syscall_info_exit(child, regs,
+ &info);
+ break;
+ }
+ break;
+ case SIGTRAP | (PTRACE_EVENT_SECCOMP << 8):
+ actual_size = ptrace_get_syscall_info_seccomp(child, regs,
+ &info);
+ break;
+ }
+
+ write_size = min(actual_size, user_size);
+ return copy_to_user(datavp, &info, write_size) ? -EFAULT : actual_size;
+}
+#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
int ptrace_request(struct task_struct *child, long request,
unsigned long addr, unsigned long data)
@@ -1114,6 +1209,10 @@ int ptrace_request(struct task_struct *child, long request,
ret = __put_user(kiov.iov_len, &uiov->iov_len);
break;
}
+
+ case PTRACE_GET_SYSCALL_INFO:
+ ret = ptrace_get_syscall_info(child, addr, datavp);
+ break;
#endif
case PTRACE_SECCOMP_GET_FILTER:
diff --git a/kernel/resource.c b/kernel/resource.c
index 158f04ec1d4f..7ea4306503c5 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -326,7 +326,7 @@ EXPORT_SYMBOL(release_resource);
*
* If a resource is found, returns 0 and @*res is overwritten with the part
* of the resource that's within [@start..@end]; if none is found, returns
- * -1 or -EINVAL for other invalid parameters.
+ * -ENODEV. Returns -EINVAL for invalid parameters.
*
* This function walks the whole tree and not just first level children
* unless @first_lvl is true.
@@ -342,6 +342,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
unsigned long flags, unsigned long desc,
bool first_lvl, struct resource *res)
{
+ bool siblings_only = true;
struct resource *p;
if (!res)
@@ -352,29 +353,43 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
read_lock(&resource_lock);
- for (p = iomem_resource.child; p; p = next_resource(p, first_lvl)) {
- if ((p->flags & flags) != flags)
- continue;
- if ((desc != IORES_DESC_NONE) && (desc != p->desc))
- continue;
+ for (p = iomem_resource.child; p; p = next_resource(p, siblings_only)) {
+ /* If we passed the resource we are looking for, stop */
if (p->start > end) {
p = NULL;
break;
}
- if ((p->end >= start) && (p->start <= end))
- break;
+
+ /* Skip until we find a range that matches what we look for */
+ if (p->end < start)
+ continue;
+
+ /*
+ * Now that we found a range that matches what we look for,
+ * check the flags and the descriptor. If we were not asked to
+ * use only the first level, start looking at children as well.
+ */
+ siblings_only = first_lvl;
+
+ if ((p->flags & flags) != flags)
+ continue;
+ if ((desc != IORES_DESC_NONE) && (desc != p->desc))
+ continue;
+
+ /* Found a match, break */
+ break;
+ }
+
+ if (p) {
+ /* copy data */
+ res->start = max(start, p->start);
+ res->end = min(end, p->end);
+ res->flags = p->flags;
+ res->desc = p->desc;
}
read_unlock(&resource_lock);
- if (!p)
- return -1;
-
- /* copy data */
- res->start = max(start, p->start);
- res->end = min(end, p->end);
- res->flags = p->flags;
- res->desc = p->desc;
- return 0;
+ return p ? 0 : -ENODEV;
}
static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
@@ -1628,6 +1643,45 @@ void resource_list_free(struct list_head *head)
}
EXPORT_SYMBOL(resource_list_free);
+#ifdef CONFIG_DEVICE_PRIVATE
+/**
+ * devm_request_free_mem_region - find free region for device private memory
+ *
+ * @dev: device struct to bind the resource to
+ * @size: size in bytes of the device memory to add
+ * @base: resource tree to look in
+ *
+ * This function tries to find an empty range of physical address big enough to
+ * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE
+ * memory, which in turn allocates struct pages.
+ */
+struct resource *devm_request_free_mem_region(struct device *dev,
+ struct resource *base, unsigned long size)
+{
+ resource_size_t end, addr;
+ struct resource *res;
+
+ size = ALIGN(size, 1UL << PA_SECTION_SHIFT);
+ end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1);
+ addr = end - size + 1UL;
+
+ for (; addr > size && addr >= base->start; addr -= size) {
+ if (region_intersects(addr, size, 0, IORES_DESC_NONE) !=
+ REGION_DISJOINT)
+ continue;
+
+ res = devm_request_mem_region(dev, addr, size, dev_name(dev));
+ if (!res)
+ return ERR_PTR(-ENOMEM);
+ res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
+ return res;
+ }
+
+ return ERR_PTR(-ERANGE);
+}
+EXPORT_SYMBOL_GPL(devm_request_free_mem_region);
+#endif /* CONFIG_DEVICE_PRIVATE */
+
static int __init strict_iomem(char *str)
{
if (strstr(str, "relaxed"))
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fa43ce3962e7..2b037f195473 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2399,6 +2399,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
unsigned long flags;
int cpu, success = 0;
+ preempt_disable();
if (p == current) {
/*
* We're waking current, this means 'p->on_rq' and 'task_cpu(p)
@@ -2412,7 +2413,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* it disabling IRQs (this allows not taking ->pi_lock).
*/
if (!(p->state & state))
- return false;
+ goto out;
success = 1;
cpu = task_cpu(p);
@@ -2526,6 +2527,7 @@ unlock:
out:
if (success)
ttwu_stat(p, cpu, wake_flags);
+ preempt_enable();
return success;
}
diff --git a/kernel/signal.c b/kernel/signal.c
index dabe100d2091..91b789dd6e72 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2951,80 +2951,49 @@ EXPORT_SYMBOL(sigprocmask);
*
* This is useful for syscalls such as ppoll, pselect, io_pgetevents and
* epoll_pwait where a new sigmask is passed from userland for the syscalls.
+ *
+ * Note that it does set_restore_sigmask() in advance, so it must be always
+ * paired with restore_saved_sigmask_unless() before return from syscall.
*/
-int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
- sigset_t *oldset, size_t sigsetsize)
+int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize)
{
- if (!usigmask)
- return 0;
+ sigset_t kmask;
+ if (!umask)
+ return 0;
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
- if (copy_from_user(set, usigmask, sizeof(sigset_t)))
+ if (copy_from_user(&kmask, umask, sizeof(sigset_t)))
return -EFAULT;
- *oldset = current->blocked;
- set_current_blocked(set);
+ set_restore_sigmask();
+ current->saved_sigmask = current->blocked;
+ set_current_blocked(&kmask);
return 0;
}
-EXPORT_SYMBOL(set_user_sigmask);
#ifdef CONFIG_COMPAT
-int set_compat_user_sigmask(const compat_sigset_t __user *usigmask,
- sigset_t *set, sigset_t *oldset,
+int set_compat_user_sigmask(const compat_sigset_t __user *umask,
size_t sigsetsize)
{
- if (!usigmask)
- return 0;
+ sigset_t kmask;
+ if (!umask)
+ return 0;
if (sigsetsize != sizeof(compat_sigset_t))
return -EINVAL;
- if (get_compat_sigset(set, usigmask))
+ if (get_compat_sigset(&kmask, umask))
return -EFAULT;
- *oldset = current->blocked;
- set_current_blocked(set);
+ set_restore_sigmask();
+ current->saved_sigmask = current->blocked;
+ set_current_blocked(&kmask);
return 0;
}
-EXPORT_SYMBOL(set_compat_user_sigmask);
#endif
-/*
- * restore_user_sigmask:
- * usigmask: sigmask passed in from userland.
- * sigsaved: saved sigmask when the syscall started and changed the sigmask to
- * usigmask.
- *
- * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
- * epoll_pwait where a new sigmask is passed in from userland for the syscalls.
- */
-void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved,
- bool interrupted)
-{
-
- if (!usigmask)
- return;
- /*
- * When signals are pending, do not restore them here.
- * Restoring sigmask here can lead to delivering signals that the above
- * syscalls are intended to block because of the sigmask passed in.
- */
- if (interrupted) {
- current->saved_sigmask = *sigsaved;
- set_restore_sigmask();
- return;
- }
-
- /*
- * This is needed because the fast syscall return path does not restore
- * saved_sigmask when signals are not pending.
- */
- set_current_blocked(sigsaved);
-}
-EXPORT_SYMBOL(restore_user_sigmask);
-
/**
* sys_rt_sigprocmask - change the list of currently blocked signals
* @how: whether to add, remove, or set signals
diff --git a/kernel/smp.c b/kernel/smp.c
index 616d4d114847..7dbcb402c2fc 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -291,6 +291,14 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
&& !oops_in_progress);
+ /*
+ * When @wait we can deadlock when we interrupt between llist_add() and
+ * arch_send_call_function_ipi*(); when !@wait we can deadlock due to
+ * csd_lock() on because the interrupt context uses the same csd
+ * storage.
+ */
+ WARN_ON_ONCE(!in_task());
+
csd = &csd_stack;
if (!wait) {
csd = this_cpu_ptr(&csd_data);
@@ -416,6 +424,14 @@ void smp_call_function_many(const struct cpumask *mask,
WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
&& !oops_in_progress && !early_boot_irqs_disabled);
+ /*
+ * When @wait we can deadlock when we interrupt between llist_add() and
+ * arch_send_call_function_ipi*(); when !@wait we can deadlock due to
+ * csd_lock() on because the interrupt context uses the same csd
+ * storage.
+ */
+ WARN_ON_ONCE(!in_task());
+
/* Try to fastpath. So, what's a CPU they want? Ignoring this one. */
cpu = cpumask_first_and(mask, cpu_online_mask);
if (cpu == this_cpu)
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index e6a02b274b73..f5440abb7532 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -226,12 +226,17 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
.store = store,
.size = size,
};
+ mm_segment_t fs;
/* Trace user stack if not a kernel thread */
if (current->flags & PF_KTHREAD)
return 0;
+ fs = get_fs();
+ set_fs(USER_DS);
arch_stack_walk_user(consume_entry, &c, task_pt_regs(current));
+ set_fs(fs);
+
return c.len;
}
#endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1c1ad1e14f21..078950d9605b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -125,9 +125,6 @@ static int sixty = 60;
#endif
static int __maybe_unused neg_one = -1;
-
-static int zero;
-static int __maybe_unused one = 1;
static int __maybe_unused two = 2;
static int __maybe_unused four = 4;
static unsigned long zero_ul;
@@ -188,17 +185,17 @@ extern int no_unaligned_warning;
* enum sysctl_writes_mode - supported sysctl write modes
*
* @SYSCTL_WRITES_LEGACY: each write syscall must fully contain the sysctl value
- * to be written, and multiple writes on the same sysctl file descriptor
- * will rewrite the sysctl value, regardless of file position. No warning
- * is issued when the initial position is not 0.
+ * to be written, and multiple writes on the same sysctl file descriptor
+ * will rewrite the sysctl value, regardless of file position. No warning
+ * is issued when the initial position is not 0.
* @SYSCTL_WRITES_WARN: same as above but warn when the initial file position is
- * not 0.
+ * not 0.
* @SYSCTL_WRITES_STRICT: writes to numeric sysctl entries must always be at
- * file position 0 and the value must be fully contained in the buffer
- * sent to the write syscall. If dealing with strings respect the file
- * position, but restrict this to the max length of the buffer, anything
- * passed the max lenght will be ignored. Multiple writes will append
- * to the buffer.
+ * file position 0 and the value must be fully contained in the buffer
+ * sent to the write syscall. If dealing with strings respect the file
+ * position, but restrict this to the max length of the buffer, anything
+ * passed the max length will be ignored. Multiple writes will append
+ * to the buffer.
*
* These write modes control how current file position affects the behavior of
* updating sysctl values through the proc interface on each write.
@@ -385,8 +382,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sysctl_schedstats,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif /* CONFIG_SCHEDSTATS */
#endif /* CONFIG_SMP */
@@ -418,7 +415,7 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &one,
+ .extra1 = SYSCTL_ONE,
},
{
.procname = "numa_balancing",
@@ -426,8 +423,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sysctl_numa_balancing,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */
@@ -475,8 +472,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif
#ifdef CONFIG_CFS_BANDWIDTH
@@ -486,7 +483,7 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &one,
+ .extra1 = SYSCTL_ONE,
},
#endif
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
@@ -496,8 +493,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_energy_aware_handler,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif
#ifdef CONFIG_PROVE_LOCKING
@@ -562,7 +559,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &neg_one,
- .extra2 = &one,
+ .extra2 = SYSCTL_ONE,
},
#endif
#ifdef CONFIG_LATENCYTOP
@@ -696,8 +693,8 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
/* only handle a transition from default "0" to "1" */
.proc_handler = proc_dointvec_minmax,
- .extra1 = &one,
- .extra2 = &one,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_ONE,
},
#endif
#ifdef CONFIG_MODULES
@@ -715,8 +712,8 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
/* only handle a transition from default "0" to "1" */
.proc_handler = proc_dointvec_minmax,
- .extra1 = &one,
- .extra2 = &one,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_ONE,
},
#endif
#ifdef CONFIG_UEVENT_HELPER
@@ -875,7 +872,7 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &ten_thousand,
},
{
@@ -891,8 +888,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax_sysadmin,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "kptr_restrict",
@@ -900,7 +897,7 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax_sysadmin,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &two,
},
#endif
@@ -925,8 +922,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_watchdog,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "watchdog_thresh",
@@ -934,7 +931,7 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_watchdog_thresh,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &sixty,
},
{
@@ -943,8 +940,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = NMI_WATCHDOG_SYSCTL_PERM,
.proc_handler = proc_nmi_watchdog,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "watchdog_cpumask",
@@ -960,8 +957,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_soft_watchdog,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "softlockup_panic",
@@ -969,8 +966,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#ifdef CONFIG_SMP
{
@@ -979,8 +976,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif /* CONFIG_SMP */
#endif
@@ -991,8 +988,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#ifdef CONFIG_SMP
{
@@ -1001,8 +998,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif /* CONFIG_SMP */
#endif
@@ -1115,8 +1112,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "hung_task_check_count",
@@ -1124,7 +1121,7 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
{
.procname = "hung_task_timeout_secs",
@@ -1201,7 +1198,7 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(sysctl_perf_event_sample_rate),
.mode = 0644,
.proc_handler = perf_proc_update_handler,
- .extra1 = &one,
+ .extra1 = SYSCTL_ONE,
},
{
.procname = "perf_cpu_time_max_percent",
@@ -1209,7 +1206,7 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(sysctl_perf_cpu_time_max_percent),
.mode = 0644,
.proc_handler = perf_cpu_time_max_percent_handler,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &one_hundred,
},
{
@@ -1218,7 +1215,7 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(sysctl_perf_event_max_stack),
.mode = 0644,
.proc_handler = perf_event_max_stack_handler,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &six_hundred_forty_kb,
},
{
@@ -1227,7 +1224,7 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack),
.mode = 0644,
.proc_handler = perf_event_max_stack_handler,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &one_thousand,
},
#endif
@@ -1237,8 +1234,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
{
@@ -1247,8 +1244,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = timer_migration_handler,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif
#ifdef CONFIG_BPF_SYSCALL
@@ -1259,8 +1256,8 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
/* only handle a transition from default "0" to "1" */
.proc_handler = proc_dointvec_minmax,
- .extra1 = &one,
- .extra2 = &one,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "bpf_stats_enabled",
@@ -1277,8 +1274,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(sysctl_panic_on_rcu_stall),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif
#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE
@@ -1288,8 +1285,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = stack_erasing_sysctl,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif
{ }
@@ -1302,7 +1299,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(sysctl_overcommit_memory),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &two,
},
{
@@ -1311,7 +1308,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(sysctl_panic_on_oom),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &two,
},
{
@@ -1348,7 +1345,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
{
.procname = "dirty_background_ratio",
@@ -1356,7 +1353,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(dirty_background_ratio),
.mode = 0644,
.proc_handler = dirty_background_ratio_handler,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &one_hundred,
},
{
@@ -1373,7 +1370,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(vm_dirty_ratio),
.mode = 0644,
.proc_handler = dirty_ratio_handler,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &one_hundred,
},
{
@@ -1397,7 +1394,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(dirty_expire_interval),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
{
.procname = "dirtytime_expire_seconds",
@@ -1405,7 +1402,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(dirtytime_expire_interval),
.mode = 0644,
.proc_handler = dirtytime_interval_handler,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
{
.procname = "swappiness",
@@ -1413,7 +1410,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(vm_swappiness),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &one_hundred,
},
#ifdef CONFIG_HUGETLB_PAGE
@@ -1438,8 +1435,8 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = sysctl_vm_numa_stat_handler,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif
{
@@ -1470,7 +1467,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = drop_caches_sysctl_handler,
- .extra1 = &one,
+ .extra1 = SYSCTL_ONE,
.extra2 = &four,
},
#ifdef CONFIG_COMPACTION
@@ -1496,8 +1493,8 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif /* CONFIG_COMPACTION */
@@ -1507,7 +1504,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(min_free_kbytes),
.mode = 0644,
.proc_handler = min_free_kbytes_sysctl_handler,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
{
.procname = "watermark_boost_factor",
@@ -1515,7 +1512,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(watermark_boost_factor),
.mode = 0644,
.proc_handler = watermark_boost_factor_sysctl_handler,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
{
.procname = "watermark_scale_factor",
@@ -1523,7 +1520,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(watermark_scale_factor),
.mode = 0644,
.proc_handler = watermark_scale_factor_sysctl_handler,
- .extra1 = &one,
+ .extra1 = SYSCTL_ONE,
.extra2 = &one_thousand,
},
{
@@ -1532,7 +1529,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(percpu_pagelist_fraction),
.mode = 0644,
.proc_handler = percpu_pagelist_fraction_sysctl_handler,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
#ifdef CONFIG_MMU
{
@@ -1541,7 +1538,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(sysctl_max_map_count),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
#else
{
@@ -1550,7 +1547,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(sysctl_nr_trim_pages),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
#endif
{
@@ -1566,7 +1563,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(block_dump),
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
{
.procname = "vfs_cache_pressure",
@@ -1574,7 +1571,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(sysctl_vfs_cache_pressure),
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
{
@@ -1583,7 +1580,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(sysctl_legacy_va_layout),
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
#endif
#ifdef CONFIG_NUMA
@@ -1593,7 +1590,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(node_reclaim_mode),
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
{
.procname = "min_unmapped_ratio",
@@ -1601,7 +1598,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(sysctl_min_unmapped_ratio),
.mode = 0644,
.proc_handler = sysctl_min_unmapped_ratio_sysctl_handler,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &one_hundred,
},
{
@@ -1610,7 +1607,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(sysctl_min_slab_ratio),
.mode = 0644,
.proc_handler = sysctl_min_slab_ratio_sysctl_handler,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &one_hundred,
},
#endif
@@ -1661,7 +1658,7 @@ static struct ctl_table vm_table[] = {
#endif
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
#endif
#ifdef CONFIG_HIGHMEM
@@ -1671,8 +1668,8 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(vm_highmem_is_dirtyable),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif
#ifdef CONFIG_MEMORY_FAILURE
@@ -1682,8 +1679,8 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(sysctl_memory_failure_early_kill),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "memory_failure_recovery",
@@ -1691,8 +1688,8 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(sysctl_memory_failure_recovery),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif
{
@@ -1738,8 +1735,8 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(sysctl_unprivileged_userfaultfd),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif
{ }
@@ -1875,8 +1872,8 @@ static struct ctl_table fs_table[] = {
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "protected_hardlinks",
@@ -1884,8 +1881,8 @@ static struct ctl_table fs_table[] = {
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "protected_fifos",
@@ -1893,7 +1890,7 @@ static struct ctl_table fs_table[] = {
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &two,
},
{
@@ -1902,7 +1899,7 @@ static struct ctl_table fs_table[] = {
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &two,
},
{
@@ -1911,7 +1908,7 @@ static struct ctl_table fs_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax_coredump,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &two,
},
#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
@@ -1948,7 +1945,7 @@ static struct ctl_table fs_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &one,
+ .extra1 = SYSCTL_ONE,
},
{ }
};
@@ -1970,8 +1967,8 @@ static struct ctl_table debug_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_kprobes_optimization_handler,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif
{ }
@@ -3395,8 +3392,8 @@ int proc_do_static_key(struct ctl_table *table, int write,
.data = &val,
.maxlen = sizeof(val),
.mode = table->mode,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
};
if (write && !capable(CAP_SYS_ADMIN))
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 564e5fdb025f..98da8998c25c 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -597,9 +597,19 @@ config FTRACE_STARTUP_TEST
functioning properly. It will do tests on all the configured
tracers of ftrace.
+config EVENT_TRACE_STARTUP_TEST
+ bool "Run selftest on trace events"
+ depends on FTRACE_STARTUP_TEST
+ default y
+ help
+ This option performs a test on all trace events in the system.
+ It basically just enables each event and runs some code that
+ will trigger events (not necessarily the event it enables)
+ This may take some time run as there are a lot of events.
+
config EVENT_TRACE_TEST_SYSCALLS
bool "Run selftest on syscall events"
- depends on FTRACE_STARTUP_TEST
+ depends on EVENT_TRACE_STARTUP_TEST
help
This option will also enable testing every syscall event.
It only enables the event and disables it and runs various loads
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 576c41644e77..eca34503f178 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1622,6 +1622,11 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec)
return keep_regs;
}
+static struct ftrace_ops *
+ftrace_find_tramp_ops_any(struct dyn_ftrace *rec);
+static struct ftrace_ops *
+ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops);
+
static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
int filter_hash,
bool inc)
@@ -1750,15 +1755,17 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
}
/*
- * If the rec had TRAMP enabled, then it needs to
- * be cleared. As TRAMP can only be enabled iff
- * there is only a single ops attached to it.
- * In otherwords, always disable it on decrementing.
- * In the future, we may set it if rec count is
- * decremented to one, and the ops that is left
- * has a trampoline.
+ * The TRAMP needs to be set only if rec count
+ * is decremented to one, and the ops that is
+ * left has a trampoline. As TRAMP can only be
+ * enabled if there is only a single ops attached
+ * to it.
*/
- rec->flags &= ~FTRACE_FL_TRAMP;
+ if (ftrace_rec_count(rec) == 1 &&
+ ftrace_find_tramp_ops_any(rec))
+ rec->flags |= FTRACE_FL_TRAMP;
+ else
+ rec->flags &= ~FTRACE_FL_TRAMP;
/*
* flags will be cleared in ftrace_check_record()
@@ -1768,7 +1775,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
count++;
/* Must match FTRACE_UPDATE_CALLS in ftrace_modify_all_code() */
- update |= ftrace_test_record(rec, 1) != FTRACE_UPDATE_IGNORE;
+ update |= ftrace_test_record(rec, true) != FTRACE_UPDATE_IGNORE;
/* Shortcut, if we handled all records, we are done. */
if (!all && count == hash->count)
@@ -1951,11 +1958,6 @@ static void print_ip_ins(const char *fmt, const unsigned char *p)
printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
}
-static struct ftrace_ops *
-ftrace_find_tramp_ops_any(struct dyn_ftrace *rec);
-static struct ftrace_ops *
-ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops);
-
enum ftrace_bug_type ftrace_bug_type;
const void *ftrace_expected;
@@ -2047,7 +2049,7 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
}
}
-static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
+static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
{
unsigned long flag = 0UL;
@@ -2146,28 +2148,28 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
/**
* ftrace_update_record, set a record that now is tracing or not
* @rec: the record to update
- * @enable: set to 1 if the record is tracing, zero to force disable
+ * @enable: set to true if the record is tracing, false to force disable
*
* The records that represent all functions that can be traced need
* to be updated when tracing has been enabled.
*/
-int ftrace_update_record(struct dyn_ftrace *rec, int enable)
+int ftrace_update_record(struct dyn_ftrace *rec, bool enable)
{
- return ftrace_check_record(rec, enable, 1);
+ return ftrace_check_record(rec, enable, true);
}
/**
* ftrace_test_record, check if the record has been enabled or not
* @rec: the record to test
- * @enable: set to 1 to check if enabled, 0 if it is disabled
+ * @enable: set to true to check if enabled, false if it is disabled
*
* The arch code may need to test if a record is already set to
* tracing to determine how to modify the function code that it
* represents.
*/
-int ftrace_test_record(struct dyn_ftrace *rec, int enable)
+int ftrace_test_record(struct dyn_ftrace *rec, bool enable)
{
- return ftrace_check_record(rec, enable, 0);
+ return ftrace_check_record(rec, enable, false);
}
static struct ftrace_ops *
@@ -2356,7 +2358,7 @@ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec)
}
static int
-__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
+__ftrace_replace_code(struct dyn_ftrace *rec, bool enable)
{
unsigned long ftrace_old_addr;
unsigned long ftrace_addr;
@@ -2395,7 +2397,7 @@ void __weak ftrace_replace_code(int mod_flags)
{
struct dyn_ftrace *rec;
struct ftrace_page *pg;
- int enable = mod_flags & FTRACE_MODIFY_ENABLE_FL;
+ bool enable = mod_flags & FTRACE_MODIFY_ENABLE_FL;
int schedulable = mod_flags & FTRACE_MODIFY_MAY_SLEEP_FL;
int failed;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 05b0b3139ebc..66358d66c933 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -128,16 +128,7 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
#define RB_ALIGNMENT 4U
#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
-
-#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS
-# define RB_FORCE_8BYTE_ALIGNMENT 0
-# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
-#else
-# define RB_FORCE_8BYTE_ALIGNMENT 1
-# define RB_ARCH_ALIGNMENT 8U
-#endif
-
-#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT)
+#define RB_ALIGN_DATA __aligned(RB_ALIGNMENT)
/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
@@ -2373,7 +2364,7 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
event->time_delta = delta;
length -= RB_EVNT_HDR_SIZE;
- if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
+ if (length > RB_MAX_SMALL_DATA) {
event->type_len = 0;
event->array[0] = length;
} else
@@ -2388,11 +2379,11 @@ static unsigned rb_calculate_event_length(unsigned length)
if (!length)
length++;
- if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
+ if (length > RB_MAX_SMALL_DATA)
length += sizeof(event.array[0]);
length += RB_EVNT_HDR_SIZE;
- length = ALIGN(length, RB_ARCH_ALIGNMENT);
+ length = ALIGN(length, RB_ALIGNMENT);
/*
* In case the time delta is larger than the 27 bits for it
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c90c687cf950..525a97fbbc60 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -366,7 +366,7 @@ trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct
}
/**
- * trace_pid_filter_add_remove_task - Add or remove a task from a pid_list
+ * trace_filter_add_remove_task - Add or remove a task from a pid_list
* @pid_list: The list to modify
* @self: The current task for fork or NULL for exit
* @task: The task to add or remove
@@ -743,8 +743,7 @@ trace_event_setup(struct ring_buffer_event *event,
{
struct trace_entry *ent = ring_buffer_event_data(event);
- tracing_generic_entry_update(ent, flags, pc);
- ent->type = type;
+ tracing_generic_entry_update(ent, type, flags, pc);
}
static __always_inline struct ring_buffer_event *
@@ -2312,13 +2311,14 @@ enum print_line_t trace_handle_return(struct trace_seq *s)
EXPORT_SYMBOL_GPL(trace_handle_return);
void
-tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
- int pc)
+tracing_generic_entry_update(struct trace_entry *entry, unsigned short type,
+ unsigned long flags, int pc)
{
struct task_struct *tsk = current;
entry->preempt_count = pc & 0xff;
entry->pid = (tsk) ? tsk->pid : 0;
+ entry->type = type;
entry->flags =
#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -4842,12 +4842,13 @@ static const char readme_msg[] =
"\t args: <name>=fetcharg[:type]\n"
"\t fetcharg: %<register>, @<address>, @<symbol>[+|-<offset>],\n"
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
- "\t $stack<index>, $stack, $retval, $comm, $arg<N>\n"
+ "\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n"
#else
- "\t $stack<index>, $stack, $retval, $comm\n"
+ "\t $stack<index>, $stack, $retval, $comm,\n"
#endif
+ "\t +|-[u]<offset>(<fetcharg>)\n"
"\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n"
- "\t b<bit-width>@<bit-offset>/<container-size>,\n"
+ "\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n"
"\t <type>\\[<array-size>\\]\n"
#ifdef CONFIG_HIST_TRIGGERS
"\t field: <stype> <name>;\n"
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 4629a6104474..0892e38ed6fb 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -416,8 +416,7 @@ void perf_trace_buf_update(void *record, u16 type)
unsigned long flags;
local_save_flags(flags);
- tracing_generic_entry_update(entry, flags, pc);
- entry->type = type;
+ tracing_generic_entry_update(entry, type, flags, pc);
}
NOKPROBE_SYMBOL(perf_trace_buf_update);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0ce3db67f556..c7506bc81b75 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -70,14 +70,6 @@ static int system_refcount_dec(struct event_subsystem *system)
#define while_for_each_event_file() \
}
-static struct list_head *
-trace_get_fields(struct trace_event_call *event_call)
-{
- if (!event_call->class->get_fields)
- return &event_call->class->fields;
- return event_call->class->get_fields(event_call);
-}
-
static struct ftrace_event_field *
__find_event_field(struct list_head *head, char *name)
{
@@ -3190,7 +3182,7 @@ void __init trace_event_init(void)
event_trace_enable();
}
-#ifdef CONFIG_FTRACE_STARTUP_TEST
+#ifdef CONFIG_EVENT_TRACE_STARTUP_TEST
static DEFINE_SPINLOCK(test_spinlock);
static DEFINE_SPINLOCK(test_spinlock_irq);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 5079d1db3754..c773b8fb270c 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1084,6 +1084,9 @@ int filter_assign_type(const char *type)
if (strchr(type, '[') && strstr(type, "char"))
return FILTER_STATIC_STRING;
+ if (strcmp(type, "char *") == 0 || strcmp(type, "const char *") == 0)
+ return FILTER_PTR_STRING;
+
return FILTER_OTHER;
}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 7d736248a070..9d483ad9bb6c 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -12,6 +12,8 @@
#include <linux/rculist.h>
#include <linux/error-injection.h>
+#include <asm/setup.h> /* for COMMAND_LINE_SIZE */
+
#include "trace_dynevent.h"
#include "trace_kprobe_selftest.h"
#include "trace_probe.h"
@@ -19,6 +21,18 @@
#define KPROBE_EVENT_SYSTEM "kprobes"
#define KRETPROBE_MAXACTIVE_MAX 4096
+#define MAX_KPROBE_CMDLINE_SIZE 1024
+
+/* Kprobe early definition from command line */
+static char kprobe_boot_events_buf[COMMAND_LINE_SIZE] __initdata;
+static bool kprobe_boot_events_enabled __initdata;
+
+static int __init set_kprobe_boot_events(char *str)
+{
+ strlcpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE);
+ return 0;
+}
+__setup("kprobe_event=", set_kprobe_boot_events);
static int trace_kprobe_create(int argc, const char **argv);
static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev);
@@ -128,8 +142,8 @@ static bool trace_kprobe_match(const char *system, const char *event,
{
struct trace_kprobe *tk = to_trace_kprobe(ev);
- return strcmp(trace_event_name(&tk->tp.call), event) == 0 &&
- (!system || strcmp(tk->tp.call.class->system, system) == 0);
+ return strcmp(trace_probe_name(&tk->tp), event) == 0 &&
+ (!system || strcmp(trace_probe_group_name(&tk->tp), system) == 0);
}
static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk)
@@ -143,6 +157,12 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk)
return nhit;
}
+static nokprobe_inline bool trace_kprobe_is_registered(struct trace_kprobe *tk)
+{
+ return !(list_empty(&tk->rp.kp.list) &&
+ hlist_unhashed(&tk->rp.kp.hlist));
+}
+
/* Return 0 if it fails to find the symbol address */
static nokprobe_inline
unsigned long trace_kprobe_address(struct trace_kprobe *tk)
@@ -183,6 +203,16 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
static int kretprobe_dispatcher(struct kretprobe_instance *ri,
struct pt_regs *regs);
+static void free_trace_kprobe(struct trace_kprobe *tk)
+{
+ if (tk) {
+ trace_probe_cleanup(&tk->tp);
+ kfree(tk->symbol);
+ free_percpu(tk->nhit);
+ kfree(tk);
+ }
+}
+
/*
* Allocate new trace_probe and initialize it (including kprobes).
*/
@@ -220,49 +250,20 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
tk->rp.kp.pre_handler = kprobe_dispatcher;
tk->rp.maxactive = maxactive;
+ INIT_HLIST_NODE(&tk->rp.kp.hlist);
+ INIT_LIST_HEAD(&tk->rp.kp.list);
- if (!event || !group) {
- ret = -EINVAL;
- goto error;
- }
-
- tk->tp.call.class = &tk->tp.class;
- tk->tp.call.name = kstrdup(event, GFP_KERNEL);
- if (!tk->tp.call.name)
- goto error;
-
- tk->tp.class.system = kstrdup(group, GFP_KERNEL);
- if (!tk->tp.class.system)
+ ret = trace_probe_init(&tk->tp, event, group);
+ if (ret < 0)
goto error;
dyn_event_init(&tk->devent, &trace_kprobe_ops);
- INIT_LIST_HEAD(&tk->tp.files);
return tk;
error:
- kfree(tk->tp.call.name);
- kfree(tk->symbol);
- free_percpu(tk->nhit);
- kfree(tk);
+ free_trace_kprobe(tk);
return ERR_PTR(ret);
}
-static void free_trace_kprobe(struct trace_kprobe *tk)
-{
- int i;
-
- if (!tk)
- return;
-
- for (i = 0; i < tk->tp.nr_args; i++)
- traceprobe_free_probe_arg(&tk->tp.args[i]);
-
- kfree(tk->tp.call.class->system);
- kfree(tk->tp.call.name);
- kfree(tk->symbol);
- free_percpu(tk->nhit);
- kfree(tk);
-}
-
static struct trace_kprobe *find_trace_kprobe(const char *event,
const char *group)
{
@@ -270,8 +271,8 @@ static struct trace_kprobe *find_trace_kprobe(const char *event,
struct trace_kprobe *tk;
for_each_trace_kprobe(tk, pos)
- if (strcmp(trace_event_name(&tk->tp.call), event) == 0 &&
- strcmp(tk->tp.call.class->system, group) == 0)
+ if (strcmp(trace_probe_name(&tk->tp), event) == 0 &&
+ strcmp(trace_probe_group_name(&tk->tp), group) == 0)
return tk;
return NULL;
}
@@ -280,7 +281,7 @@ static inline int __enable_trace_kprobe(struct trace_kprobe *tk)
{
int ret = 0;
- if (trace_probe_is_registered(&tk->tp) && !trace_kprobe_has_gone(tk)) {
+ if (trace_kprobe_is_registered(tk) && !trace_kprobe_has_gone(tk)) {
if (trace_kprobe_is_return(tk))
ret = enable_kretprobe(&tk->rp);
else
@@ -297,34 +298,27 @@ static inline int __enable_trace_kprobe(struct trace_kprobe *tk)
static int
enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
{
- struct event_file_link *link;
+ bool enabled = trace_probe_is_enabled(&tk->tp);
int ret = 0;
if (file) {
- link = kmalloc(sizeof(*link), GFP_KERNEL);
- if (!link) {
- ret = -ENOMEM;
- goto out;
- }
-
- link->file = file;
- list_add_tail_rcu(&link->list, &tk->tp.files);
+ ret = trace_probe_add_file(&tk->tp, file);
+ if (ret)
+ return ret;
+ } else
+ trace_probe_set_flag(&tk->tp, TP_FLAG_PROFILE);
- tk->tp.flags |= TP_FLAG_TRACE;
- ret = __enable_trace_kprobe(tk);
- if (ret) {
- list_del_rcu(&link->list);
- kfree(link);
- tk->tp.flags &= ~TP_FLAG_TRACE;
- }
+ if (enabled)
+ return 0;
- } else {
- tk->tp.flags |= TP_FLAG_PROFILE;
- ret = __enable_trace_kprobe(tk);
- if (ret)
- tk->tp.flags &= ~TP_FLAG_PROFILE;
+ ret = __enable_trace_kprobe(tk);
+ if (ret) {
+ if (file)
+ trace_probe_remove_file(&tk->tp, file);
+ else
+ trace_probe_clear_flag(&tk->tp, TP_FLAG_PROFILE);
}
- out:
+
return ret;
}
@@ -335,54 +329,34 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
static int
disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
{
- struct event_file_link *link = NULL;
- int wait = 0;
+ struct trace_probe *tp = &tk->tp;
int ret = 0;
if (file) {
- link = find_event_file_link(&tk->tp, file);
- if (!link) {
- ret = -EINVAL;
- goto out;
- }
-
- list_del_rcu(&link->list);
- wait = 1;
- if (!list_empty(&tk->tp.files))
+ if (!trace_probe_get_file_link(tp, file))
+ return -ENOENT;
+ if (!trace_probe_has_single_file(tp))
goto out;
-
- tk->tp.flags &= ~TP_FLAG_TRACE;
+ trace_probe_clear_flag(tp, TP_FLAG_TRACE);
} else
- tk->tp.flags &= ~TP_FLAG_PROFILE;
+ trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
- if (!trace_probe_is_enabled(&tk->tp) && trace_probe_is_registered(&tk->tp)) {
+ if (!trace_probe_is_enabled(tp) && trace_kprobe_is_registered(tk)) {
if (trace_kprobe_is_return(tk))
disable_kretprobe(&tk->rp);
else
disable_kprobe(&tk->rp.kp);
- wait = 1;
}
- /*
- * if tk is not added to any list, it must be a local trace_kprobe
- * created with perf_event_open. We don't need to wait for these
- * trace_kprobes
- */
- if (list_empty(&tk->devent.list))
- wait = 0;
out:
- if (wait) {
+ if (file)
/*
- * Synchronize with kprobe_trace_func/kretprobe_trace_func
- * to ensure disabled (all running handlers are finished).
- * This is not only for kfree(), but also the caller,
- * trace_remove_event_call() supposes it for releasing
- * event_call related objects, which will be accessed in
- * the kprobe_trace_func/kretprobe_trace_func.
+ * Synchronization is done in below function. For perf event,
+ * file == NULL and perf_trace_event_unreg() calls
+ * tracepoint_synchronize_unregister() to ensure synchronize
+ * event. We don't need to care about it.
*/
- synchronize_rcu();
- kfree(link); /* Ignored if link == NULL */
- }
+ trace_probe_remove_file(tp, file);
return ret;
}
@@ -415,7 +389,7 @@ static int __register_trace_kprobe(struct trace_kprobe *tk)
{
int i, ret;
- if (trace_probe_is_registered(&tk->tp))
+ if (trace_kprobe_is_registered(tk))
return -EINVAL;
if (within_notrace_func(tk)) {
@@ -441,21 +415,20 @@ static int __register_trace_kprobe(struct trace_kprobe *tk)
else
ret = register_kprobe(&tk->rp.kp);
- if (ret == 0)
- tk->tp.flags |= TP_FLAG_REGISTERED;
return ret;
}
/* Internal unregister function - just handle k*probes and flags */
static void __unregister_trace_kprobe(struct trace_kprobe *tk)
{
- if (trace_probe_is_registered(&tk->tp)) {
+ if (trace_kprobe_is_registered(tk)) {
if (trace_kprobe_is_return(tk))
unregister_kretprobe(&tk->rp);
else
unregister_kprobe(&tk->rp.kp);
- tk->tp.flags &= ~TP_FLAG_REGISTERED;
- /* Cleanup kprobe for reuse */
+ /* Cleanup kprobe for reuse and mark it unregistered */
+ INIT_HLIST_NODE(&tk->rp.kp.hlist);
+ INIT_LIST_HEAD(&tk->rp.kp.list);
if (tk->rp.kp.symbol_name)
tk->rp.kp.addr = NULL;
}
@@ -487,8 +460,8 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
mutex_lock(&event_mutex);
/* Delete old (same name) event if exist */
- old_tk = find_trace_kprobe(trace_event_name(&tk->tp.call),
- tk->tp.call.class->system);
+ old_tk = find_trace_kprobe(trace_probe_name(&tk->tp),
+ trace_probe_group_name(&tk->tp));
if (old_tk) {
ret = unregister_trace_kprobe(old_tk);
if (ret < 0)
@@ -541,7 +514,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb,
ret = __register_trace_kprobe(tk);
if (ret)
pr_warn("Failed to re-register probe %s on %s: %d\n",
- trace_event_name(&tk->tp.call),
+ trace_probe_name(&tk->tp),
mod->name, ret);
}
}
@@ -716,6 +689,10 @@ static int trace_kprobe_create(int argc, const char *argv[])
goto error; /* This can be -ENOMEM */
}
+ ret = traceprobe_set_print_fmt(&tk->tp, is_return);
+ if (ret < 0)
+ goto error;
+
ret = register_trace_kprobe(tk);
if (ret) {
trace_probe_log_set_index(1);
@@ -767,8 +744,8 @@ static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev)
int i;
seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p');
- seq_printf(m, ":%s/%s", tk->tp.call.class->system,
- trace_event_name(&tk->tp.call));
+ seq_printf(m, ":%s/%s", trace_probe_group_name(&tk->tp),
+ trace_probe_name(&tk->tp));
if (!tk->symbol)
seq_printf(m, " 0x%p", tk->rp.kp.addr);
@@ -842,7 +819,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
tk = to_trace_kprobe(ev);
seq_printf(m, " %-44s %15lu %15lu\n",
- trace_event_name(&tk->tp.call),
+ trace_probe_name(&tk->tp),
trace_kprobe_nhit(tk),
tk->rp.kp.nmissed);
@@ -886,6 +863,15 @@ fetch_store_strlen(unsigned long addr)
return (ret < 0) ? ret : len;
}
+/* Return the length of string -- including null terminal byte */
+static nokprobe_inline int
+fetch_store_strlen_user(unsigned long addr)
+{
+ const void __user *uaddr = (__force const void __user *)addr;
+
+ return strnlen_unsafe_user(uaddr, MAX_STRING_SIZE);
+}
+
/*
* Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max
* length and relative data location.
@@ -894,19 +880,46 @@ static nokprobe_inline int
fetch_store_string(unsigned long addr, void *dest, void *base)
{
int maxlen = get_loc_len(*(u32 *)dest);
- u8 *dst = get_loc_data(dest, base);
+ void *__dest;
long ret;
if (unlikely(!maxlen))
return -ENOMEM;
+
+ __dest = get_loc_data(dest, base);
+
/*
* Try to get string again, since the string can be changed while
* probing.
*/
- ret = strncpy_from_unsafe(dst, (void *)addr, maxlen);
+ ret = strncpy_from_unsafe(__dest, (void *)addr, maxlen);
+ if (ret >= 0)
+ *(u32 *)dest = make_data_loc(ret, __dest - base);
+
+ return ret;
+}
+/*
+ * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf
+ * with max length and relative data location.
+ */
+static nokprobe_inline int
+fetch_store_string_user(unsigned long addr, void *dest, void *base)
+{
+ const void __user *uaddr = (__force const void __user *)addr;
+ int maxlen = get_loc_len(*(u32 *)dest);
+ void *__dest;
+ long ret;
+
+ if (unlikely(!maxlen))
+ return -ENOMEM;
+
+ __dest = get_loc_data(dest, base);
+
+ ret = strncpy_from_unsafe_user(__dest, uaddr, maxlen);
if (ret >= 0)
- *(u32 *)dest = make_data_loc(ret, (void *)dst - base);
+ *(u32 *)dest = make_data_loc(ret, __dest - base);
+
return ret;
}
@@ -916,6 +929,14 @@ probe_mem_read(void *dest, void *src, size_t size)
return probe_kernel_read(dest, src, size);
}
+static nokprobe_inline int
+probe_mem_read_user(void *dest, void *src, size_t size)
+{
+ const void __user *uaddr = (__force const void __user *)src;
+
+ return probe_user_read(dest, uaddr, size);
+}
+
/* Note that we don't verify it, since the code does not come from user space */
static int
process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest,
@@ -971,7 +992,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
struct ring_buffer *buffer;
int size, dsize, pc;
unsigned long irq_flags;
- struct trace_event_call *call = &tk->tp.call;
+ struct trace_event_call *call = trace_probe_event_call(&tk->tp);
WARN_ON(call != trace_file->event_call);
@@ -1003,7 +1024,7 @@ kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs)
{
struct event_file_link *link;
- list_for_each_entry_rcu(link, &tk->tp.files, list)
+ trace_probe_for_each_link_rcu(link, &tk->tp)
__kprobe_trace_func(tk, regs, link->file);
}
NOKPROBE_SYMBOL(kprobe_trace_func);
@@ -1019,7 +1040,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
struct ring_buffer *buffer;
int size, pc, dsize;
unsigned long irq_flags;
- struct trace_event_call *call = &tk->tp.call;
+ struct trace_event_call *call = trace_probe_event_call(&tk->tp);
WARN_ON(call != trace_file->event_call);
@@ -1053,7 +1074,7 @@ kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
{
struct event_file_link *link;
- list_for_each_entry_rcu(link, &tk->tp.files, list)
+ trace_probe_for_each_link_rcu(link, &tk->tp)
__kretprobe_trace_func(tk, ri, regs, link->file);
}
NOKPROBE_SYMBOL(kretprobe_trace_func);
@@ -1070,7 +1091,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
field = (struct kprobe_trace_entry_head *)iter->ent;
tp = container_of(event, struct trace_probe, call.event);
- trace_seq_printf(s, "%s: (", trace_event_name(&tp->call));
+ trace_seq_printf(s, "%s: (", trace_probe_name(tp));
if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
goto out;
@@ -1097,7 +1118,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
field = (struct kretprobe_trace_entry_head *)iter->ent;
tp = container_of(event, struct trace_probe, call.event);
- trace_seq_printf(s, "%s: (", trace_event_name(&tp->call));
+ trace_seq_printf(s, "%s: (", trace_probe_name(tp));
if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
goto out;
@@ -1149,7 +1170,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call)
static int
kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
{
- struct trace_event_call *call = &tk->tp.call;
+ struct trace_event_call *call = trace_probe_event_call(&tk->tp);
struct kprobe_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
@@ -1199,7 +1220,7 @@ static void
kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
struct pt_regs *regs)
{
- struct trace_event_call *call = &tk->tp.call;
+ struct trace_event_call *call = trace_probe_event_call(&tk->tp);
struct kretprobe_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
@@ -1299,10 +1320,10 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
raw_cpu_inc(*tk->nhit);
- if (tk->tp.flags & TP_FLAG_TRACE)
+ if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE))
kprobe_trace_func(tk, regs);
#ifdef CONFIG_PERF_EVENTS
- if (tk->tp.flags & TP_FLAG_PROFILE)
+ if (trace_probe_test_flag(&tk->tp, TP_FLAG_PROFILE))
ret = kprobe_perf_func(tk, regs);
#endif
return ret;
@@ -1316,10 +1337,10 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
raw_cpu_inc(*tk->nhit);
- if (tk->tp.flags & TP_FLAG_TRACE)
+ if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE))
kretprobe_trace_func(tk, ri, regs);
#ifdef CONFIG_PERF_EVENTS
- if (tk->tp.flags & TP_FLAG_PROFILE)
+ if (trace_probe_test_flag(&tk->tp, TP_FLAG_PROFILE))
kretprobe_perf_func(tk, ri, regs);
#endif
return 0; /* We don't tweek kernel, so just return 0 */
@@ -1334,10 +1355,10 @@ static struct trace_event_functions kprobe_funcs = {
.trace = print_kprobe_event
};
-static inline void init_trace_event_call(struct trace_kprobe *tk,
- struct trace_event_call *call)
+static inline void init_trace_event_call(struct trace_kprobe *tk)
{
- INIT_LIST_HEAD(&call->class->fields);
+ struct trace_event_call *call = trace_probe_event_call(&tk->tp);
+
if (trace_kprobe_is_return(tk)) {
call->event.funcs = &kretprobe_funcs;
call->class->define_fields = kretprobe_event_define_fields;
@@ -1353,37 +1374,14 @@ static inline void init_trace_event_call(struct trace_kprobe *tk,
static int register_kprobe_event(struct trace_kprobe *tk)
{
- struct trace_event_call *call = &tk->tp.call;
- int ret = 0;
-
- init_trace_event_call(tk, call);
+ init_trace_event_call(tk);
- if (traceprobe_set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0)
- return -ENOMEM;
- ret = register_trace_event(&call->event);
- if (!ret) {
- kfree(call->print_fmt);
- return -ENODEV;
- }
- ret = trace_add_event_call(call);
- if (ret) {
- pr_info("Failed to register kprobe event: %s\n",
- trace_event_name(call));
- kfree(call->print_fmt);
- unregister_trace_event(&call->event);
- }
- return ret;
+ return trace_probe_register_event_call(&tk->tp);
}
static int unregister_kprobe_event(struct trace_kprobe *tk)
{
- int ret;
-
- /* tp->event is unregistered in trace_remove_event_call() */
- ret = trace_remove_event_call(&tk->tp.call);
- if (!ret)
- kfree(tk->tp.call.print_fmt);
- return ret;
+ return trace_probe_unregister_event_call(&tk->tp);
}
#ifdef CONFIG_PERF_EVENTS
@@ -1413,7 +1411,7 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
return ERR_CAST(tk);
}
- init_trace_event_call(tk, &tk->tp.call);
+ init_trace_event_call(tk);
if (traceprobe_set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) {
ret = -ENOMEM;
@@ -1421,12 +1419,10 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
}
ret = __register_trace_kprobe(tk);
- if (ret < 0) {
- kfree(tk->tp.call.print_fmt);
+ if (ret < 0)
goto error;
- }
- return &tk->tp.call;
+ return trace_probe_event_call(&tk->tp);
error:
free_trace_kprobe(tk);
return ERR_PTR(ret);
@@ -1445,11 +1441,50 @@ void destroy_local_trace_kprobe(struct trace_event_call *event_call)
__unregister_trace_kprobe(tk);
- kfree(tk->tp.call.print_fmt);
free_trace_kprobe(tk);
}
#endif /* CONFIG_PERF_EVENTS */
+static __init void enable_boot_kprobe_events(void)
+{
+ struct trace_array *tr = top_trace_array();
+ struct trace_event_file *file;
+ struct trace_kprobe *tk;
+ struct dyn_event *pos;
+
+ mutex_lock(&event_mutex);
+ for_each_trace_kprobe(tk, pos) {
+ list_for_each_entry(file, &tr->events, list)
+ if (file->event_call == trace_probe_event_call(&tk->tp))
+ trace_event_enable_disable(file, 1, 0);
+ }
+ mutex_unlock(&event_mutex);
+}
+
+static __init void setup_boot_kprobe_events(void)
+{
+ char *p, *cmd = kprobe_boot_events_buf;
+ int ret;
+
+ strreplace(kprobe_boot_events_buf, ',', ' ');
+
+ while (cmd && *cmd != '\0') {
+ p = strchr(cmd, ';');
+ if (p)
+ *p++ = '\0';
+
+ ret = trace_run_command(cmd, create_or_delete_trace_kprobe);
+ if (ret)
+ pr_warn("Failed to add event(%d): %s\n", ret, cmd);
+ else
+ kprobe_boot_events_enabled = true;
+
+ cmd = p;
+ }
+
+ enable_boot_kprobe_events();
+}
+
/* Make a tracefs interface for controlling probe points */
static __init int init_kprobe_trace(void)
{
@@ -1481,6 +1516,9 @@ static __init int init_kprobe_trace(void)
if (!entry)
pr_warn("Could not create tracefs 'kprobe_profile' entry\n");
+
+ setup_boot_kprobe_events();
+
return 0;
}
fs_initcall(init_kprobe_trace);
@@ -1493,7 +1531,7 @@ find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr)
struct trace_event_file *file;
list_for_each_entry(file, &tr->events, list)
- if (file->event_call == &tk->tp.call)
+ if (file->event_call == trace_probe_event_call(&tk->tp))
return file;
return NULL;
@@ -1513,6 +1551,11 @@ static __init int kprobe_trace_self_tests_init(void)
if (tracing_is_disabled())
return -ENODEV;
+ if (kprobe_boot_events_enabled) {
+ pr_info("Skipping kprobe tests due to kprobe_event on cmdline\n");
+ return 0;
+ }
+
target = kprobe_trace_selftest_target;
pr_info("Testing kprobe tracing: ");
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index ba751f993c3b..cab4a5398f1d 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1109,17 +1109,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
unsigned long ip = field->caller[i];
- if (ip == ULONG_MAX || trace_seq_has_overflowed(s))
+ if (!ip || trace_seq_has_overflowed(s))
break;
trace_seq_puts(s, " => ");
-
- if (!ip) {
- trace_seq_puts(s, "??");
- trace_seq_putc(s, '\n');
- continue;
- }
-
seq_print_user_ip(s, mm, ip, flags);
trace_seq_putc(s, '\n');
}
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index a347faced959..dbef0d135075 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -78,6 +78,8 @@ static const struct fetch_type probe_fetch_types[] = {
/* Special types */
__ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1,
"__data_loc char[]"),
+ __ASSIGN_FETCH_TYPE("ustring", string, string, sizeof(u32), 1,
+ "__data_loc char[]"),
/* Basic types */
ASSIGN_FETCH_TYPE(u8, u8, 0),
ASSIGN_FETCH_TYPE(u16, u16, 0),
@@ -322,6 +324,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
{
struct fetch_insn *code = *pcode;
unsigned long param;
+ int deref = FETCH_OP_DEREF;
long offset = 0;
char *tmp;
int ret = 0;
@@ -394,9 +397,14 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
break;
case '+': /* deref memory */
- arg++; /* Skip '+', because kstrtol() rejects it. */
- /* fall through */
case '-':
+ if (arg[1] == 'u') {
+ deref = FETCH_OP_UDEREF;
+ arg[1] = arg[0];
+ arg++;
+ }
+ if (arg[0] == '+')
+ arg++; /* Skip '+', because kstrtol() rejects it. */
tmp = strchr(arg, '(');
if (!tmp) {
trace_probe_log_err(offs, DEREF_NEED_BRACE);
@@ -432,7 +440,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
}
*pcode = code;
- code->op = FETCH_OP_DEREF;
+ code->op = deref;
code->offset = offset;
}
break;
@@ -569,15 +577,17 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
goto fail;
/* Store operation */
- if (!strcmp(parg->type->name, "string")) {
- if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_IMM &&
- code->op != FETCH_OP_COMM) {
+ if (!strcmp(parg->type->name, "string") ||
+ !strcmp(parg->type->name, "ustring")) {
+ if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF &&
+ code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM) {
trace_probe_log_err(offset + (t ? (t - arg) : 0),
BAD_STRING);
ret = -EINVAL;
goto fail;
}
- if (code->op != FETCH_OP_DEREF || parg->count) {
+ if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM) ||
+ parg->count) {
/*
* IMM and COMM is pointing actual address, those must
* be kept, and if parg->count != 0, this is an array
@@ -590,12 +600,20 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
goto fail;
}
}
- code->op = FETCH_OP_ST_STRING; /* In DEREF case, replace it */
+ /* If op == DEREF, replace it with STRING */
+ if (!strcmp(parg->type->name, "ustring") ||
+ code->op == FETCH_OP_UDEREF)
+ code->op = FETCH_OP_ST_USTRING;
+ else
+ code->op = FETCH_OP_ST_STRING;
code->size = parg->type->size;
parg->dynamic = true;
} else if (code->op == FETCH_OP_DEREF) {
code->op = FETCH_OP_ST_MEM;
code->size = parg->type->size;
+ } else if (code->op == FETCH_OP_UDEREF) {
+ code->op = FETCH_OP_ST_UMEM;
+ code->size = parg->type->size;
} else {
code++;
if (code->op != FETCH_OP_NOP) {
@@ -618,7 +636,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
/* Loop(Array) operation */
if (parg->count) {
if (scode->op != FETCH_OP_ST_MEM &&
- scode->op != FETCH_OP_ST_STRING) {
+ scode->op != FETCH_OP_ST_STRING &&
+ scode->op != FETCH_OP_ST_USTRING) {
trace_probe_log_err(offset + (t ? (t - arg) : 0),
BAD_STRING);
ret = -EINVAL;
@@ -825,6 +844,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return)
{
+ struct trace_event_call *call = trace_probe_event_call(tp);
int len;
char *print_fmt;
@@ -836,7 +856,7 @@ int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return)
/* Second: actually write the @print_fmt */
__set_print_fmt(tp, print_fmt, len + 1, is_return);
- tp->call.print_fmt = print_fmt;
+ call->print_fmt = print_fmt;
return 0;
}
@@ -865,3 +885,105 @@ int traceprobe_define_arg_fields(struct trace_event_call *event_call,
}
return 0;
}
+
+
+void trace_probe_cleanup(struct trace_probe *tp)
+{
+ struct trace_event_call *call = trace_probe_event_call(tp);
+ int i;
+
+ for (i = 0; i < tp->nr_args; i++)
+ traceprobe_free_probe_arg(&tp->args[i]);
+
+ kfree(call->class->system);
+ kfree(call->name);
+ kfree(call->print_fmt);
+}
+
+int trace_probe_init(struct trace_probe *tp, const char *event,
+ const char *group)
+{
+ struct trace_event_call *call = trace_probe_event_call(tp);
+
+ if (!event || !group)
+ return -EINVAL;
+
+ call->class = &tp->class;
+ call->name = kstrdup(event, GFP_KERNEL);
+ if (!call->name)
+ return -ENOMEM;
+
+ tp->class.system = kstrdup(group, GFP_KERNEL);
+ if (!tp->class.system) {
+ kfree(call->name);
+ call->name = NULL;
+ return -ENOMEM;
+ }
+ INIT_LIST_HEAD(&tp->files);
+ INIT_LIST_HEAD(&tp->class.fields);
+
+ return 0;
+}
+
+int trace_probe_register_event_call(struct trace_probe *tp)
+{
+ struct trace_event_call *call = trace_probe_event_call(tp);
+ int ret;
+
+ ret = register_trace_event(&call->event);
+ if (!ret)
+ return -ENODEV;
+
+ ret = trace_add_event_call(call);
+ if (ret)
+ unregister_trace_event(&call->event);
+
+ return ret;
+}
+
+int trace_probe_add_file(struct trace_probe *tp, struct trace_event_file *file)
+{
+ struct event_file_link *link;
+
+ link = kmalloc(sizeof(*link), GFP_KERNEL);
+ if (!link)
+ return -ENOMEM;
+
+ link->file = file;
+ INIT_LIST_HEAD(&link->list);
+ list_add_tail_rcu(&link->list, &tp->files);
+ trace_probe_set_flag(tp, TP_FLAG_TRACE);
+ return 0;
+}
+
+struct event_file_link *trace_probe_get_file_link(struct trace_probe *tp,
+ struct trace_event_file *file)
+{
+ struct event_file_link *link;
+
+ trace_probe_for_each_link(link, tp) {
+ if (link->file == file)
+ return link;
+ }
+
+ return NULL;
+}
+
+int trace_probe_remove_file(struct trace_probe *tp,
+ struct trace_event_file *file)
+{
+ struct event_file_link *link;
+
+ link = trace_probe_get_file_link(tp, file);
+ if (!link)
+ return -ENOENT;
+
+ list_del_rcu(&link->list);
+ synchronize_rcu();
+ kfree(link);
+
+ if (list_empty(&tp->files))
+ trace_probe_clear_flag(tp, TP_FLAG_TRACE);
+
+ return 0;
+}
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index f9a8c632188b..d1714820efe1 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -55,7 +55,6 @@
/* Flags for trace_probe */
#define TP_FLAG_TRACE 1
#define TP_FLAG_PROFILE 2
-#define TP_FLAG_REGISTERED 4
/* data_loc: data location, compatible with u32 */
#define make_data_loc(len, offs) \
@@ -92,10 +91,13 @@ enum fetch_op {
FETCH_OP_FOFFS, /* File offset: .immediate */
// Stage 2 (dereference) op
FETCH_OP_DEREF, /* Dereference: .offset */
+ FETCH_OP_UDEREF, /* User-space Dereference: .offset */
// Stage 3 (store) ops
FETCH_OP_ST_RAW, /* Raw: .size */
FETCH_OP_ST_MEM, /* Mem: .offset, .size */
+ FETCH_OP_ST_UMEM, /* Mem: .offset, .size */
FETCH_OP_ST_STRING, /* String: .offset, .size */
+ FETCH_OP_ST_USTRING, /* User String: .offset, .size */
// Stage 4 (modify) op
FETCH_OP_MOD_BF, /* Bitfield: .basesize, .lshift, .rshift */
// Stage 5 (loop) op
@@ -235,16 +237,71 @@ struct event_file_link {
struct list_head list;
};
+static inline bool trace_probe_test_flag(struct trace_probe *tp,
+ unsigned int flag)
+{
+ return !!(tp->flags & flag);
+}
+
+static inline void trace_probe_set_flag(struct trace_probe *tp,
+ unsigned int flag)
+{
+ tp->flags |= flag;
+}
+
+static inline void trace_probe_clear_flag(struct trace_probe *tp,
+ unsigned int flag)
+{
+ tp->flags &= ~flag;
+}
+
static inline bool trace_probe_is_enabled(struct trace_probe *tp)
{
- return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE));
+ return trace_probe_test_flag(tp, TP_FLAG_TRACE | TP_FLAG_PROFILE);
}
-static inline bool trace_probe_is_registered(struct trace_probe *tp)
+static inline const char *trace_probe_name(struct trace_probe *tp)
{
- return !!(tp->flags & TP_FLAG_REGISTERED);
+ return trace_event_name(&tp->call);
}
+static inline const char *trace_probe_group_name(struct trace_probe *tp)
+{
+ return tp->call.class->system;
+}
+
+static inline struct trace_event_call *
+ trace_probe_event_call(struct trace_probe *tp)
+{
+ return &tp->call;
+}
+
+static inline int trace_probe_unregister_event_call(struct trace_probe *tp)
+{
+ /* tp->event is unregistered in trace_remove_event_call() */
+ return trace_remove_event_call(&tp->call);
+}
+
+static inline bool trace_probe_has_single_file(struct trace_probe *tp)
+{
+ return !!list_is_singular(&tp->files);
+}
+
+int trace_probe_init(struct trace_probe *tp, const char *event,
+ const char *group);
+void trace_probe_cleanup(struct trace_probe *tp);
+int trace_probe_register_event_call(struct trace_probe *tp);
+int trace_probe_add_file(struct trace_probe *tp, struct trace_event_file *file);
+int trace_probe_remove_file(struct trace_probe *tp,
+ struct trace_event_file *file);
+struct event_file_link *trace_probe_get_file_link(struct trace_probe *tp,
+ struct trace_event_file *file);
+
+#define trace_probe_for_each_link(pos, tp) \
+ list_for_each_entry(pos, &(tp)->files, list)
+#define trace_probe_for_each_link_rcu(pos, tp) \
+ list_for_each_entry_rcu(pos, &(tp)->files, list)
+
/* Check the name is good for event/group/fields */
static inline bool is_good_name(const char *name)
{
@@ -257,18 +314,6 @@ static inline bool is_good_name(const char *name)
return true;
}
-static inline struct event_file_link *
-find_event_file_link(struct trace_probe *tp, struct trace_event_file *file)
-{
- struct event_file_link *link;
-
- list_for_each_entry(link, &tp->files, list)
- if (link->file == file)
- return link;
-
- return NULL;
-}
-
#define TPARG_FL_RETURN BIT(0)
#define TPARG_FL_KERNEL BIT(1)
#define TPARG_FL_FENTRY BIT(2)
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index c30c61f12ddd..e5282828f4a6 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -59,8 +59,13 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs,
static nokprobe_inline int fetch_store_strlen(unsigned long addr);
static nokprobe_inline int
fetch_store_string(unsigned long addr, void *dest, void *base);
+static nokprobe_inline int fetch_store_strlen_user(unsigned long addr);
+static nokprobe_inline int
+fetch_store_string_user(unsigned long addr, void *dest, void *base);
static nokprobe_inline int
probe_mem_read(void *dest, void *src, size_t size);
+static nokprobe_inline int
+probe_mem_read_user(void *dest, void *src, size_t size);
/* From the 2nd stage, routine is same */
static nokprobe_inline int
@@ -74,14 +79,21 @@ process_fetch_insn_bottom(struct fetch_insn *code, unsigned long val,
stage2:
/* 2nd stage: dereference memory if needed */
- while (code->op == FETCH_OP_DEREF) {
- lval = val;
- ret = probe_mem_read(&val, (void *)val + code->offset,
- sizeof(val));
+ do {
+ if (code->op == FETCH_OP_DEREF) {
+ lval = val;
+ ret = probe_mem_read(&val, (void *)val + code->offset,
+ sizeof(val));
+ } else if (code->op == FETCH_OP_UDEREF) {
+ lval = val;
+ ret = probe_mem_read_user(&val,
+ (void *)val + code->offset, sizeof(val));
+ } else
+ break;
if (ret)
return ret;
code++;
- }
+ } while (1);
s3 = code;
stage3:
@@ -91,6 +103,10 @@ stage3:
ret = fetch_store_strlen(val + code->offset);
code++;
goto array;
+ } else if (code->op == FETCH_OP_ST_USTRING) {
+ ret += fetch_store_strlen_user(val + code->offset);
+ code++;
+ goto array;
} else
return -EILSEQ;
}
@@ -102,10 +118,17 @@ stage3:
case FETCH_OP_ST_MEM:
probe_mem_read(dest, (void *)val + code->offset, code->size);
break;
+ case FETCH_OP_ST_UMEM:
+ probe_mem_read_user(dest, (void *)val + code->offset, code->size);
+ break;
case FETCH_OP_ST_STRING:
loc = *(u32 *)dest;
ret = fetch_store_string(val + code->offset, dest, base);
break;
+ case FETCH_OP_ST_USTRING:
+ loc = *(u32 *)dest;
+ ret = fetch_store_string_user(val + code->offset, dest, base);
+ break;
default:
return -EILSEQ;
}
@@ -123,7 +146,8 @@ array:
total += ret;
if (++i < code->param) {
code = s3;
- if (s3->op != FETCH_OP_ST_STRING) {
+ if (s3->op != FETCH_OP_ST_STRING &&
+ s3->op != FETCH_OP_ST_USTRING) {
dest += s3->size;
val += s3->size;
goto stage3;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 7860e3f59fad..1ceedb9146b1 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -140,6 +140,13 @@ probe_mem_read(void *dest, void *src, size_t size)
return copy_from_user(dest, vaddr, size) ? -EFAULT : 0;
}
+
+static nokprobe_inline int
+probe_mem_read_user(void *dest, void *src, size_t size)
+{
+ return probe_mem_read(dest, src, size);
+}
+
/*
* Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
* length and relative data location.
@@ -176,6 +183,12 @@ fetch_store_string(unsigned long addr, void *dest, void *base)
return ret;
}
+static nokprobe_inline int
+fetch_store_string_user(unsigned long addr, void *dest, void *base)
+{
+ return fetch_store_string(addr, dest, base);
+}
+
/* Return the length of string -- including null terminal byte */
static nokprobe_inline int
fetch_store_strlen(unsigned long addr)
@@ -191,6 +204,12 @@ fetch_store_strlen(unsigned long addr)
return (len > MAX_STRING_SIZE) ? 0 : len;
}
+static nokprobe_inline int
+fetch_store_strlen_user(unsigned long addr)
+{
+ return fetch_store_strlen(addr);
+}
+
static unsigned long translate_user_vaddr(unsigned long file_offset)
{
unsigned long base_addr;
@@ -270,8 +289,8 @@ static bool trace_uprobe_match(const char *system, const char *event,
{
struct trace_uprobe *tu = to_trace_uprobe(ev);
- return strcmp(trace_event_name(&tu->tp.call), event) == 0 &&
- (!system || strcmp(tu->tp.call.class->system, system) == 0);
+ return strcmp(trace_probe_name(&tu->tp), event) == 0 &&
+ (!system || strcmp(trace_probe_group_name(&tu->tp), system) == 0);
}
/*
@@ -281,25 +300,17 @@ static struct trace_uprobe *
alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
{
struct trace_uprobe *tu;
-
- if (!event || !group)
- return ERR_PTR(-EINVAL);
+ int ret;
tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL);
if (!tu)
return ERR_PTR(-ENOMEM);
- tu->tp.call.class = &tu->tp.class;
- tu->tp.call.name = kstrdup(event, GFP_KERNEL);
- if (!tu->tp.call.name)
- goto error;
-
- tu->tp.class.system = kstrdup(group, GFP_KERNEL);
- if (!tu->tp.class.system)
+ ret = trace_probe_init(&tu->tp, event, group);
+ if (ret < 0)
goto error;
dyn_event_init(&tu->devent, &trace_uprobe_ops);
- INIT_LIST_HEAD(&tu->tp.files);
tu->consumer.handler = uprobe_dispatcher;
if (is_ret)
tu->consumer.ret_handler = uretprobe_dispatcher;
@@ -307,25 +318,18 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
return tu;
error:
- kfree(tu->tp.call.name);
kfree(tu);
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(ret);
}
static void free_trace_uprobe(struct trace_uprobe *tu)
{
- int i;
-
if (!tu)
return;
- for (i = 0; i < tu->tp.nr_args; i++)
- traceprobe_free_probe_arg(&tu->tp.args[i]);
-
path_put(&tu->path);
- kfree(tu->tp.call.class->system);
- kfree(tu->tp.call.name);
+ trace_probe_cleanup(&tu->tp);
kfree(tu->filename);
kfree(tu);
}
@@ -336,8 +340,8 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou
struct trace_uprobe *tu;
for_each_trace_uprobe(tu, pos)
- if (strcmp(trace_event_name(&tu->tp.call), event) == 0 &&
- strcmp(tu->tp.call.class->system, group) == 0)
+ if (strcmp(trace_probe_name(&tu->tp), event) == 0 &&
+ strcmp(trace_probe_group_name(&tu->tp), group) == 0)
return tu;
return NULL;
@@ -372,8 +376,8 @@ static struct trace_uprobe *find_old_trace_uprobe(struct trace_uprobe *new)
struct trace_uprobe *tmp, *old = NULL;
struct inode *new_inode = d_real_inode(new->path.dentry);
- old = find_probe_event(trace_event_name(&new->tp.call),
- new->tp.call.class->system);
+ old = find_probe_event(trace_probe_name(&new->tp),
+ trace_probe_group_name(&new->tp));
for_each_trace_uprobe(tmp, pos) {
if ((old ? old != tmp : true) &&
@@ -578,6 +582,10 @@ static int trace_uprobe_create(int argc, const char **argv)
goto error;
}
+ ret = traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu));
+ if (ret < 0)
+ goto error;
+
ret = register_trace_uprobe(tu);
if (!ret)
goto out;
@@ -621,8 +629,8 @@ static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev)
char c = is_ret_probe(tu) ? 'r' : 'p';
int i;
- seq_printf(m, "%c:%s/%s %s:0x%0*lx", c, tu->tp.call.class->system,
- trace_event_name(&tu->tp.call), tu->filename,
+ seq_printf(m, "%c:%s/%s %s:0x%0*lx", c, trace_probe_group_name(&tu->tp),
+ trace_probe_name(&tu->tp), tu->filename,
(int)(sizeof(void *) * 2), tu->offset);
if (tu->ref_ctr_offset)
@@ -692,7 +700,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
tu = to_trace_uprobe(ev);
seq_printf(m, " %s %-44s %15lu\n", tu->filename,
- trace_event_name(&tu->tp.call), tu->nhit);
+ trace_probe_name(&tu->tp), tu->nhit);
return 0;
}
@@ -818,7 +826,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
struct ring_buffer *buffer;
void *data;
int size, esize;
- struct trace_event_call *call = &tu->tp.call;
+ struct trace_event_call *call = trace_probe_event_call(&tu->tp);
WARN_ON(call != trace_file->event_call);
@@ -860,7 +868,7 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs,
return 0;
rcu_read_lock();
- list_for_each_entry_rcu(link, &tu->tp.files, list)
+ trace_probe_for_each_link_rcu(link, &tu->tp)
__uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file);
rcu_read_unlock();
@@ -874,7 +882,7 @@ static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func,
struct event_file_link *link;
rcu_read_lock();
- list_for_each_entry_rcu(link, &tu->tp.files, list)
+ trace_probe_for_each_link_rcu(link, &tu->tp)
__uprobe_trace_func(tu, func, regs, ucb, dsize, link->file);
rcu_read_unlock();
}
@@ -893,12 +901,12 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
if (is_ret_probe(tu)) {
trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)",
- trace_event_name(&tu->tp.call),
+ trace_probe_name(&tu->tp),
entry->vaddr[1], entry->vaddr[0]);
data = DATAOF_TRACE_ENTRY(entry, true);
} else {
trace_seq_printf(s, "%s: (0x%lx)",
- trace_event_name(&tu->tp.call),
+ trace_probe_name(&tu->tp),
entry->vaddr[0]);
data = DATAOF_TRACE_ENTRY(entry, false);
}
@@ -921,26 +929,20 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
filter_func_t filter)
{
bool enabled = trace_probe_is_enabled(&tu->tp);
- struct event_file_link *link = NULL;
int ret;
if (file) {
- if (tu->tp.flags & TP_FLAG_PROFILE)
+ if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE))
return -EINTR;
- link = kmalloc(sizeof(*link), GFP_KERNEL);
- if (!link)
- return -ENOMEM;
-
- link->file = file;
- list_add_tail_rcu(&link->list, &tu->tp.files);
-
- tu->tp.flags |= TP_FLAG_TRACE;
+ ret = trace_probe_add_file(&tu->tp, file);
+ if (ret < 0)
+ return ret;
} else {
- if (tu->tp.flags & TP_FLAG_TRACE)
+ if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE))
return -EINTR;
- tu->tp.flags |= TP_FLAG_PROFILE;
+ trace_probe_set_flag(&tu->tp, TP_FLAG_PROFILE);
}
WARN_ON(!uprobe_filter_is_empty(&tu->filter));
@@ -970,13 +972,11 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
uprobe_buffer_disable();
err_flags:
- if (file) {
- list_del(&link->list);
- kfree(link);
- tu->tp.flags &= ~TP_FLAG_TRACE;
- } else {
- tu->tp.flags &= ~TP_FLAG_PROFILE;
- }
+ if (file)
+ trace_probe_remove_file(&tu->tp, file);
+ else
+ trace_probe_clear_flag(&tu->tp, TP_FLAG_PROFILE);
+
return ret;
}
@@ -987,26 +987,18 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file)
return;
if (file) {
- struct event_file_link *link;
-
- link = find_event_file_link(&tu->tp, file);
- if (!link)
+ if (trace_probe_remove_file(&tu->tp, file) < 0)
return;
- list_del_rcu(&link->list);
- /* synchronize with u{,ret}probe_trace_func */
- synchronize_rcu();
- kfree(link);
-
- if (!list_empty(&tu->tp.files))
+ if (trace_probe_is_enabled(&tu->tp))
return;
- }
+ } else
+ trace_probe_clear_flag(&tu->tp, TP_FLAG_PROFILE);
WARN_ON(!uprobe_filter_is_empty(&tu->filter));
uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
tu->inode = NULL;
- tu->tp.flags &= file ? ~TP_FLAG_TRACE : ~TP_FLAG_PROFILE;
uprobe_buffer_disable();
}
@@ -1126,7 +1118,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
unsigned long func, struct pt_regs *regs,
struct uprobe_cpu_buffer *ucb, int dsize)
{
- struct trace_event_call *call = &tu->tp.call;
+ struct trace_event_call *call = trace_probe_event_call(&tu->tp);
struct uprobe_trace_entry_head *entry;
struct hlist_head *head;
void *data;
@@ -1279,11 +1271,11 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
ucb = uprobe_buffer_get();
store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize);
- if (tu->tp.flags & TP_FLAG_TRACE)
+ if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE))
ret |= uprobe_trace_func(tu, regs, ucb, dsize);
#ifdef CONFIG_PERF_EVENTS
- if (tu->tp.flags & TP_FLAG_PROFILE)
+ if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE))
ret |= uprobe_perf_func(tu, regs, ucb, dsize);
#endif
uprobe_buffer_put(ucb);
@@ -1314,11 +1306,11 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con,
ucb = uprobe_buffer_get();
store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize);
- if (tu->tp.flags & TP_FLAG_TRACE)
+ if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE))
uretprobe_trace_func(tu, func, regs, ucb, dsize);
#ifdef CONFIG_PERF_EVENTS
- if (tu->tp.flags & TP_FLAG_PROFILE)
+ if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE))
uretprobe_perf_func(tu, func, regs, ucb, dsize);
#endif
uprobe_buffer_put(ucb);
@@ -1329,10 +1321,10 @@ static struct trace_event_functions uprobe_funcs = {
.trace = print_uprobe_event
};
-static inline void init_trace_event_call(struct trace_uprobe *tu,
- struct trace_event_call *call)
+static inline void init_trace_event_call(struct trace_uprobe *tu)
{
- INIT_LIST_HEAD(&call->class->fields);
+ struct trace_event_call *call = trace_probe_event_call(&tu->tp);
+
call->event.funcs = &uprobe_funcs;
call->class->define_fields = uprobe_event_define_fields;
@@ -1343,43 +1335,14 @@ static inline void init_trace_event_call(struct trace_uprobe *tu,
static int register_uprobe_event(struct trace_uprobe *tu)
{
- struct trace_event_call *call = &tu->tp.call;
- int ret = 0;
-
- init_trace_event_call(tu, call);
-
- if (traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0)
- return -ENOMEM;
+ init_trace_event_call(tu);
- ret = register_trace_event(&call->event);
- if (!ret) {
- kfree(call->print_fmt);
- return -ENODEV;
- }
-
- ret = trace_add_event_call(call);
-
- if (ret) {
- pr_info("Failed to register uprobe event: %s\n",
- trace_event_name(call));
- kfree(call->print_fmt);
- unregister_trace_event(&call->event);
- }
-
- return ret;
+ return trace_probe_register_event_call(&tu->tp);
}
static int unregister_uprobe_event(struct trace_uprobe *tu)
{
- int ret;
-
- /* tu->event is unregistered in trace_remove_event_call() */
- ret = trace_remove_event_call(&tu->tp.call);
- if (ret)
- return ret;
- kfree(tu->tp.call.print_fmt);
- tu->tp.call.print_fmt = NULL;
- return 0;
+ return trace_probe_unregister_event_call(&tu->tp);
}
#ifdef CONFIG_PERF_EVENTS
@@ -1419,14 +1382,14 @@ create_local_trace_uprobe(char *name, unsigned long offs,
tu->path = path;
tu->ref_ctr_offset = ref_ctr_offset;
tu->filename = kstrdup(name, GFP_KERNEL);
- init_trace_event_call(tu, &tu->tp.call);
+ init_trace_event_call(tu);
if (traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) {
ret = -ENOMEM;
goto error;
}
- return &tu->tp.call;
+ return trace_probe_event_call(&tu->tp);
error:
free_trace_uprobe(tu);
return ERR_PTR(ret);
@@ -1438,9 +1401,6 @@ void destroy_local_trace_uprobe(struct trace_event_call *event_call)
tu = container_of(event_call, struct trace_uprobe, tp.call);
- kfree(tu->tp.call.print_fmt);
- tu->tp.call.print_fmt = NULL;
-
free_trace_uprobe(tu);
}
#endif /* CONFIG_PERF_EVENTS */
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index df3ade14ccbd..73956eaff8a9 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -55,8 +55,8 @@ struct tp_probes {
static inline void *allocate_probes(int count)
{
- struct tp_probes *p = kmalloc(count * sizeof(struct tracepoint_func)
- + sizeof(struct tp_probes), GFP_KERNEL);
+ struct tp_probes *p = kmalloc(struct_size(p, probes, count),
+ GFP_KERNEL);
return p == NULL ? NULL : p->probes;
}
diff --git a/kernel/ucount.c b/kernel/ucount.c
index feb128c7b5d9..a53cc2b4179c 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -52,16 +52,14 @@ static struct ctl_table_root set_root = {
.permissions = set_permissions,
};
-static int zero = 0;
-static int int_max = INT_MAX;
#define UCOUNT_ENTRY(name) \
{ \
.procname = name, \
.maxlen = sizeof(int), \
.mode = 0644, \
.proc_handler = proc_dointvec_minmax, \
- .extra1 = &zero, \
- .extra2 = &int_max, \
+ .extra1 = SYSCTL_ZERO, \
+ .extra2 = SYSCTL_INT_MAX, \
}
static struct ctl_table user_table[] = {
UCOUNT_ENTRY("max_user_namespaces"),