summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c7
-rw-r--r--include/linux/perf_event.h7
-rw-r--r--kernel/bpf/arraymap.c21
-rw-r--r--kernel/events/core.c428
-rw-r--r--kernel/trace/bpf_trace.c14
5 files changed, 233 insertions, 244 deletions
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index a667078a5180..fed2ab1f1065 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1960,7 +1960,8 @@ intel_bts_constraints(struct perf_event *event)
static int intel_alt_er(int idx, u64 config)
{
- int alt_idx;
+ int alt_idx = idx;
+
if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
return idx;
@@ -2897,14 +2898,12 @@ static void intel_pmu_cpu_starting(int cpu)
return;
if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
- void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
-
for_each_cpu(i, topology_sibling_cpumask(cpu)) {
struct intel_shared_regs *pc;
pc = per_cpu(cpu_hw_events, i).shared_regs;
if (pc && pc->core_id == core_id) {
- *onln = cpuc->shared_regs;
+ cpuc->kfree_on_online[0] = cpuc->shared_regs;
cpuc->shared_regs = pc;
break;
}
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 6612732d8fd0..b35a61a481fa 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -634,9 +634,6 @@ struct perf_event_context {
int nr_cgroups; /* cgroup evts */
void *task_ctx_data; /* pmu specific data */
struct rcu_head rcu_head;
-
- struct delayed_work orphans_remove;
- bool orphans_remove_sched;
};
/*
@@ -729,7 +726,7 @@ extern int perf_event_init_task(struct task_struct *child);
extern void perf_event_exit_task(struct task_struct *child);
extern void perf_event_free_task(struct task_struct *task);
extern void perf_event_delayed_put(struct task_struct *task);
-extern struct perf_event *perf_event_get(unsigned int fd);
+extern struct file *perf_event_get(unsigned int fd);
extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event);
extern void perf_event_print_debug(void);
extern void perf_pmu_disable(struct pmu *pmu);
@@ -1070,7 +1067,7 @@ static inline int perf_event_init_task(struct task_struct *child) { return 0; }
static inline void perf_event_exit_task(struct task_struct *child) { }
static inline void perf_event_free_task(struct task_struct *task) { }
static inline void perf_event_delayed_put(struct task_struct *task) { }
-static inline struct perf_event *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); }
+static inline struct file *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); }
static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
return ERR_PTR(-EINVAL);
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index b0799bced518..89ebbc4d1164 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -291,10 +291,13 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
{
struct perf_event *event;
const struct perf_event_attr *attr;
+ struct file *file;
- event = perf_event_get(fd);
- if (IS_ERR(event))
- return event;
+ file = perf_event_get(fd);
+ if (IS_ERR(file))
+ return file;
+
+ event = file->private_data;
attr = perf_event_attrs(event);
if (IS_ERR(attr))
@@ -304,24 +307,22 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
goto err;
if (attr->type == PERF_TYPE_RAW)
- return event;
+ return file;
if (attr->type == PERF_TYPE_HARDWARE)
- return event;
+ return file;
if (attr->type == PERF_TYPE_SOFTWARE &&
attr->config == PERF_COUNT_SW_BPF_OUTPUT)
- return event;
+ return file;
err:
- perf_event_release_kernel(event);
+ fput(file);
return ERR_PTR(-EINVAL);
}
static void perf_event_fd_array_put_ptr(void *ptr)
{
- struct perf_event *event = ptr;
-
- perf_event_release_kernel(event);
+ fput((struct file *)ptr);
}
static const struct bpf_map_ops perf_event_array_ops = {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9de4d352ba8c..f1e53e8d4ae2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -49,8 +49,6 @@
#include <asm/irq_regs.h>
-static struct workqueue_struct *perf_wq;
-
typedef int (*remote_function_f)(void *);
struct remote_function_call {
@@ -152,7 +150,7 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
static bool is_kernel_event(struct perf_event *event)
{
- return event->owner == TASK_TOMBSTONE;
+ return READ_ONCE(event->owner) == TASK_TOMBSTONE;
}
/*
@@ -1041,9 +1039,8 @@ static void put_ctx(struct perf_event_context *ctx)
* perf_event_context::mutex nests and those are:
*
* - perf_event_exit_task_context() [ child , 0 ]
- * __perf_event_exit_task()
- * sync_child_event()
- * put_event() [ parent, 1 ]
+ * perf_event_exit_event()
+ * put_event() [ parent, 1 ]
*
* - perf_event_init_context() [ parent, 0 ]
* inherit_task_group()
@@ -1086,8 +1083,8 @@ static void put_ctx(struct perf_event_context *ctx)
* Lock order:
* task_struct::perf_event_mutex
* perf_event_context::mutex
- * perf_event_context::lock
* perf_event::child_mutex;
+ * perf_event_context::lock
* perf_event::mmap_mutex
* mmap_sem
*/
@@ -1230,9 +1227,9 @@ retry:
!atomic_inc_not_zero(&ctx->refcount)) {
raw_spin_unlock(&ctx->lock);
ctx = NULL;
+ } else {
+ WARN_ON_ONCE(ctx->task != task);
}
-
- WARN_ON_ONCE(ctx->task != task);
}
rcu_read_unlock();
if (!ctx)
@@ -1646,45 +1643,11 @@ out:
perf_event__header_size(tmp);
}
-/*
- * User event without the task.
- */
static bool is_orphaned_event(struct perf_event *event)
{
- return event && !is_kernel_event(event) && !event->owner;
-}
-
-/*
- * Event has a parent but parent's task finished and it's
- * alive only because of children holding refference.
- */
-static bool is_orphaned_child(struct perf_event *event)
-{
- return is_orphaned_event(event->parent);
+ return event->state == PERF_EVENT_STATE_EXIT;
}
-static void orphans_remove_work(struct work_struct *work);
-
-static void schedule_orphans_remove(struct perf_event_context *ctx)
-{
- if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
- return;
-
- if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
- get_ctx(ctx);
- ctx->orphans_remove_sched = true;
- }
-}
-
-static int __init perf_workqueue_init(void)
-{
- perf_wq = create_singlethread_workqueue("perf");
- WARN(!perf_wq, "failed to create perf workqueue\n");
- return perf_wq ? 0 : -1;
-}
-
-core_initcall(perf_workqueue_init);
-
static inline int pmu_filter_match(struct perf_event *event)
{
struct pmu *pmu = event->pmu;
@@ -1745,9 +1708,6 @@ event_sched_out(struct perf_event *event,
if (event->attr.exclusive || !cpuctx->active_oncpu)
cpuctx->exclusive = 0;
- if (is_orphaned_child(event))
- schedule_orphans_remove(ctx);
-
perf_pmu_enable(event->pmu);
}
@@ -1771,6 +1731,9 @@ group_sched_out(struct perf_event *group_event,
cpuctx->exclusive = 0;
}
+#define DETACH_GROUP 0x01UL
+#define DETACH_STATE 0x02UL
+
/*
* Cross CPU call to remove a performance event
*
@@ -1783,12 +1746,14 @@ __perf_remove_from_context(struct perf_event *event,
struct perf_event_context *ctx,
void *info)
{
- bool detach_group = (unsigned long)info;
+ unsigned long flags = (unsigned long)info;
event_sched_out(event, cpuctx, ctx);
- if (detach_group)
+ if (flags & DETACH_GROUP)
perf_group_detach(event);
list_del_event(event, ctx);
+ if (flags & DETACH_STATE)
+ event->state = PERF_EVENT_STATE_EXIT;
if (!ctx->nr_events && ctx->is_active) {
ctx->is_active = 0;
@@ -1809,12 +1774,11 @@ __perf_remove_from_context(struct perf_event *event,
* When called from perf_event_exit_task, it's OK because the
* context has been detached from its task.
*/
-static void perf_remove_from_context(struct perf_event *event, bool detach_group)
+static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
{
lockdep_assert_held(&event->ctx->mutex);
- event_function_call(event, __perf_remove_from_context,
- (void *)(unsigned long)detach_group);
+ event_function_call(event, __perf_remove_from_context, (void *)flags);
}
/*
@@ -1846,7 +1810,8 @@ static void __perf_event_disable(struct perf_event *event,
* remains valid. This condition is satisifed when called through
* perf_event_for_each_child or perf_event_for_each because they
* hold the top-level event's child_mutex, so any descendant that
- * goes to exit will block in sync_child_event.
+ * goes to exit will block in perf_event_exit_event().
+ *
* When called from perf_pending_event it's OK because event->ctx
* is the current context on this CPU and preemption is disabled,
* hence we can't get into perf_event_task_sched_out for this context.
@@ -1980,9 +1945,6 @@ event_sched_in(struct perf_event *event,
if (event->attr.exclusive)
cpuctx->exclusive = 1;
- if (is_orphaned_child(event))
- schedule_orphans_remove(ctx);
-
out:
perf_pmu_enable(event->pmu);
@@ -2253,7 +2215,8 @@ static void __perf_event_enable(struct perf_event *event,
struct perf_event *leader = event->group_leader;
struct perf_event_context *task_ctx;
- if (event->state >= PERF_EVENT_STATE_INACTIVE)
+ if (event->state >= PERF_EVENT_STATE_INACTIVE ||
+ event->state <= PERF_EVENT_STATE_ERROR)
return;
update_context_time(ctx);
@@ -2298,7 +2261,8 @@ static void _perf_event_enable(struct perf_event *event)
struct perf_event_context *ctx = event->ctx;
raw_spin_lock_irq(&ctx->lock);
- if (event->state >= PERF_EVENT_STATE_INACTIVE) {
+ if (event->state >= PERF_EVENT_STATE_INACTIVE ||
+ event->state < PERF_EVENT_STATE_ERROR) {
raw_spin_unlock_irq(&ctx->lock);
return;
}
@@ -3363,7 +3327,6 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
INIT_LIST_HEAD(&ctx->flexible_groups);
INIT_LIST_HEAD(&ctx->event_list);
atomic_set(&ctx->refcount, 1);
- INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
}
static struct perf_event_context *
@@ -3590,7 +3553,7 @@ static void unaccount_event(struct perf_event *event)
* 3) two matching events on the same context.
*
* The former two cases are handled in the allocation path (perf_event_alloc(),
- * __free_event()), the latter -- before the first perf_install_in_context().
+ * _free_event()), the latter -- before the first perf_install_in_context().
*/
static int exclusive_event_init(struct perf_event *event)
{
@@ -3665,29 +3628,6 @@ static bool exclusive_event_installable(struct perf_event *event,
return true;
}
-static void __free_event(struct perf_event *event)
-{
- if (!event->parent) {
- if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
- put_callchain_buffers();
- }
-
- perf_event_free_bpf_prog(event);
-
- if (event->destroy)
- event->destroy(event);
-
- if (event->ctx)
- put_ctx(event->ctx);
-
- if (event->pmu) {
- exclusive_event_destroy(event);
- module_put(event->pmu->module);
- }
-
- call_rcu(&event->rcu_head, free_event_rcu);
-}
-
static void _free_event(struct perf_event *event)
{
irq_work_sync(&event->pending);
@@ -3709,7 +3649,25 @@ static void _free_event(struct perf_event *event)
if (is_cgroup_event(event))
perf_detach_cgroup(event);
- __free_event(event);
+ if (!event->parent) {
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ put_callchain_buffers();
+ }
+
+ perf_event_free_bpf_prog(event);
+
+ if (event->destroy)
+ event->destroy(event);
+
+ if (event->ctx)
+ put_ctx(event->ctx);
+
+ if (event->pmu) {
+ exclusive_event_destroy(event);
+ module_put(event->pmu->module);
+ }
+
+ call_rcu(&event->rcu_head, free_event_rcu);
}
/*
@@ -3736,14 +3694,13 @@ static void perf_remove_from_owner(struct perf_event *event)
struct task_struct *owner;
rcu_read_lock();
- owner = ACCESS_ONCE(event->owner);
/*
- * Matches the smp_wmb() in perf_event_exit_task(). If we observe
- * !owner it means the list deletion is complete and we can indeed
- * free this event, otherwise we need to serialize on
+ * Matches the smp_store_release() in perf_event_exit_task(). If we
+ * observe !owner it means the list deletion is complete and we can
+ * indeed free this event, otherwise we need to serialize on
* owner->perf_event_mutex.
*/
- smp_read_barrier_depends();
+ owner = lockless_dereference(event->owner);
if (owner) {
/*
* Since delayed_put_task_struct() also drops the last
@@ -3771,8 +3728,10 @@ static void perf_remove_from_owner(struct perf_event *event)
* ensured they're done, and we can proceed with freeing the
* event.
*/
- if (event->owner)
+ if (event->owner) {
list_del_init(&event->owner_entry);
+ smp_store_release(&event->owner, NULL);
+ }
mutex_unlock(&owner->perf_event_mutex);
put_task_struct(owner);
}
@@ -3780,36 +3739,98 @@ static void perf_remove_from_owner(struct perf_event *event)
static void put_event(struct perf_event *event)
{
- struct perf_event_context *ctx;
-
if (!atomic_long_dec_and_test(&event->refcount))
return;
+ _free_event(event);
+}
+
+/*
+ * Kill an event dead; while event:refcount will preserve the event
+ * object, it will not preserve its functionality. Once the last 'user'
+ * gives up the object, we'll destroy the thing.
+ */
+int perf_event_release_kernel(struct perf_event *event)
+{
+ struct perf_event_context *ctx;
+ struct perf_event *child, *tmp;
+
if (!is_kernel_event(event))
perf_remove_from_owner(event);
+ ctx = perf_event_ctx_lock(event);
+ WARN_ON_ONCE(ctx->parent_ctx);
+ perf_remove_from_context(event, DETACH_GROUP | DETACH_STATE);
+ perf_event_ctx_unlock(event, ctx);
+
/*
- * There are two ways this annotation is useful:
+ * At this point we must have event->state == PERF_EVENT_STATE_EXIT,
+ * either from the above perf_remove_from_context() or through
+ * perf_event_exit_event().
*
- * 1) there is a lock recursion from perf_event_exit_task
- * see the comment there.
+ * Therefore, anybody acquiring event->child_mutex after the below
+ * loop _must_ also see this, most importantly inherit_event() which
+ * will avoid placing more children on the list.
*
- * 2) there is a lock-inversion with mmap_sem through
- * perf_read_group(), which takes faults while
- * holding ctx->mutex, however this is called after
- * the last filedesc died, so there is no possibility
- * to trigger the AB-BA case.
+ * Thus this guarantees that we will in fact observe and kill _ALL_
+ * child events.
*/
- ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
- WARN_ON_ONCE(ctx->parent_ctx);
- perf_remove_from_context(event, true);
- perf_event_ctx_unlock(event, ctx);
+ WARN_ON_ONCE(event->state != PERF_EVENT_STATE_EXIT);
- _free_event(event);
-}
+again:
+ mutex_lock(&event->child_mutex);
+ list_for_each_entry(child, &event->child_list, child_list) {
-int perf_event_release_kernel(struct perf_event *event)
-{
+ /*
+ * Cannot change, child events are not migrated, see the
+ * comment with perf_event_ctx_lock_nested().
+ */
+ ctx = lockless_dereference(child->ctx);
+ /*
+ * Since child_mutex nests inside ctx::mutex, we must jump
+ * through hoops. We start by grabbing a reference on the ctx.
+ *
+ * Since the event cannot get freed while we hold the
+ * child_mutex, the context must also exist and have a !0
+ * reference count.
+ */
+ get_ctx(ctx);
+
+ /*
+ * Now that we have a ctx ref, we can drop child_mutex, and
+ * acquire ctx::mutex without fear of it going away. Then we
+ * can re-acquire child_mutex.
+ */
+ mutex_unlock(&event->child_mutex);
+ mutex_lock(&ctx->mutex);
+ mutex_lock(&event->child_mutex);
+
+ /*
+ * Now that we hold ctx::mutex and child_mutex, revalidate our
+ * state, if child is still the first entry, it didn't get freed
+ * and we can continue doing so.
+ */
+ tmp = list_first_entry_or_null(&event->child_list,
+ struct perf_event, child_list);
+ if (tmp == child) {
+ perf_remove_from_context(child, DETACH_GROUP);
+ list_del(&child->child_list);
+ free_event(child);
+ /*
+ * This matches the refcount bump in inherit_event();
+ * this can't be the last reference.
+ */
+ put_event(event);
+ }
+
+ mutex_unlock(&event->child_mutex);
+ mutex_unlock(&ctx->mutex);
+ put_ctx(ctx);
+ goto again;
+ }
+ mutex_unlock(&event->child_mutex);
+
+ /* Must be the last reference */
put_event(event);
return 0;
}
@@ -3820,46 +3841,10 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
*/
static int perf_release(struct inode *inode, struct file *file)
{
- put_event(file->private_data);
+ perf_event_release_kernel(file->private_data);
return 0;
}
-/*
- * Remove all orphanes events from the context.
- */
-static void orphans_remove_work(struct work_struct *work)
-{
- struct perf_event_context *ctx;
- struct perf_event *event, *tmp;
-
- ctx = container_of(work, struct perf_event_context,
- orphans_remove.work);
-
- mutex_lock(&ctx->mutex);
- list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
- struct perf_event *parent_event = event->parent;
-
- if (!is_orphaned_child(event))
- continue;
-
- perf_remove_from_context(event, true);
-
- mutex_lock(&parent_event->child_mutex);
- list_del_init(&event->child_list);
- mutex_unlock(&parent_event->child_mutex);
-
- free_event(event);
- put_event(parent_event);
- }
-
- raw_spin_lock_irq(&ctx->lock);
- ctx->orphans_remove_sched = false;
- raw_spin_unlock_irq(&ctx->lock);
- mutex_unlock(&ctx->mutex);
-
- put_ctx(ctx);
-}
-
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
struct perf_event *child;
@@ -4088,7 +4073,7 @@ static void _perf_event_reset(struct perf_event *event)
/*
* Holding the top-level event's child_mutex means that any
* descendant process that has inherited this event will block
- * in sync_child_event if it goes to exit, thus satisfying the
+ * in perf_event_exit_event() if it goes to exit, thus satisfying the
* task existence requirements of perf_event_enable/disable.
*/
static void perf_event_for_each_child(struct perf_event *event,
@@ -8432,11 +8417,11 @@ SYSCALL_DEFINE5(perf_event_open,
* See perf_event_ctx_lock() for comments on the details
* of swizzling perf_event::ctx.
*/
- perf_remove_from_context(group_leader, false);
+ perf_remove_from_context(group_leader, 0);
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
- perf_remove_from_context(sibling, false);
+ perf_remove_from_context(sibling, 0);
put_ctx(gctx);
}
@@ -8489,6 +8474,8 @@ SYSCALL_DEFINE5(perf_event_open,
perf_event__header_size(event);
perf_event__id_header_size(event);
+ event->owner = current;
+
perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
@@ -8498,8 +8485,6 @@ SYSCALL_DEFINE5(perf_event_open,
put_online_cpus();
- event->owner = current;
-
mutex_lock(&current->perf_event_mutex);
list_add_tail(&event->owner_entry, &current->perf_event_list);
mutex_unlock(&current->perf_event_mutex);
@@ -8616,7 +8601,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
event_entry) {
- perf_remove_from_context(event, false);
+ perf_remove_from_context(event, 0);
unaccount_event_cpu(event, src_cpu);
put_ctx(src_ctx);
list_add(&event->migrate_entry, &events);
@@ -8683,33 +8668,15 @@ static void sync_child_event(struct perf_event *child_event,
&parent_event->child_total_time_enabled);
atomic64_add(child_event->total_time_running,
&parent_event->child_total_time_running);
-
- /*
- * Remove this event from the parent's list
- */
- WARN_ON_ONCE(parent_event->ctx->parent_ctx);
- mutex_lock(&parent_event->child_mutex);
- list_del_init(&child_event->child_list);
- mutex_unlock(&parent_event->child_mutex);
-
- /*
- * Make sure user/parent get notified, that we just
- * lost one event.
- */
- perf_event_wakeup(parent_event);
-
- /*
- * Release the parent event, if this was the last
- * reference to it.
- */
- put_event(parent_event);
}
static void
-__perf_event_exit_task(struct perf_event *child_event,
- struct perf_event_context *child_ctx,
- struct task_struct *child)
+perf_event_exit_event(struct perf_event *child_event,
+ struct perf_event_context *child_ctx,
+ struct task_struct *child)
{
+ struct perf_event *parent_event = child_event->parent;
+
/*
* Do not destroy the 'original' grouping; because of the context
* switch optimization the original events could've ended up in a
@@ -8725,37 +8692,70 @@ __perf_event_exit_task(struct perf_event *child_event,
raw_spin_lock_irq(&child_ctx->lock);
WARN_ON_ONCE(child_ctx->is_active);
- if (!!child_event->parent)
+ if (parent_event)
perf_group_detach(child_event);
list_del_event(child_event, child_ctx);
+ child_event->state = PERF_EVENT_STATE_EXIT; /* see perf_event_release_kernel() */
raw_spin_unlock_irq(&child_ctx->lock);
/*
- * It can happen that the parent exits first, and has events
- * that are still around due to the child reference. These
- * events need to be zapped.
+ * Parent events are governed by their filedesc, retain them.
*/
- if (child_event->parent) {
- sync_child_event(child_event, child);
- free_event(child_event);
- } else {
- child_event->state = PERF_EVENT_STATE_EXIT;
+ if (!parent_event) {
perf_event_wakeup(child_event);
+ return;
}
+ /*
+ * Child events can be cleaned up.
+ */
+
+ sync_child_event(child_event, child);
+
+ /*
+ * Remove this event from the parent's list
+ */
+ WARN_ON_ONCE(parent_event->ctx->parent_ctx);
+ mutex_lock(&parent_event->child_mutex);
+ list_del_init(&child_event->child_list);
+ mutex_unlock(&parent_event->child_mutex);
+
+ /*
+ * Kick perf_poll() for is_event_hup().
+ */
+ perf_event_wakeup(parent_event);
+ free_event(child_event);
+ put_event(parent_event);
}
static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
{
struct perf_event_context *child_ctx, *clone_ctx = NULL;
struct perf_event *child_event, *next;
- unsigned long flags;
WARN_ON_ONCE(child != current);
- child_ctx = perf_lock_task_context(child, ctxn, &flags);
+ child_ctx = perf_pin_task_context(child, ctxn);
if (!child_ctx)
return;
+ /*
+ * In order to reduce the amount of tricky in ctx tear-down, we hold
+ * ctx::mutex over the entire thing. This serializes against almost
+ * everything that wants to access the ctx.
+ *
+ * The exception is sys_perf_event_open() /
+ * perf_event_create_kernel_count() which does find_get_context()
+ * without ctx::mutex (it cannot because of the move_group double mutex
+ * lock thing). See the comments in perf_install_in_context().
+ */
+ mutex_lock(&child_ctx->mutex);
+
+ /*
+ * In a single ctx::lock section, de-schedule the events and detach the
+ * context from the task such that we cannot ever get it scheduled back
+ * in.
+ */
+ raw_spin_lock_irq(&child_ctx->lock);
task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
/*
@@ -8767,14 +8767,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
put_task_struct(current); /* cannot be last */
- /*
- * If this context is a clone; unclone it so it can't get
- * swapped to another process while we're removing all
- * the events from it.
- */
clone_ctx = unclone_ctx(child_ctx);
- update_context_time(child_ctx);
- raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
+ raw_spin_unlock_irq(&child_ctx->lock);
if (clone_ctx)
put_ctx(clone_ctx);
@@ -8786,20 +8780,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
*/
perf_event_task(child, child_ctx, 0);
- /*
- * We can recurse on the same lock type through:
- *
- * __perf_event_exit_task()
- * sync_child_event()
- * put_event()
- * mutex_lock(&ctx->mutex)
- *
- * But since its the parent context it won't be the same instance.
- */
- mutex_lock(&child_ctx->mutex);
-
list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
- __perf_event_exit_task(child_event, child_ctx, child);
+ perf_event_exit_event(child_event, child_ctx, child);
mutex_unlock(&child_ctx->mutex);
@@ -8824,8 +8806,7 @@ void perf_event_exit_task(struct task_struct *child)
* the owner, closes a race against perf_release() where
* we need to serialize on the owner->perf_event_mutex.
*/
- smp_wmb();
- event->owner = NULL;
+ smp_store_release(&event->owner, NULL);
}
mutex_unlock(&child->perf_event_mutex);
@@ -8908,21 +8889,20 @@ void perf_event_delayed_put(struct task_struct *task)
WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
}
-struct perf_event *perf_event_get(unsigned int fd)
+struct file *perf_event_get(unsigned int fd)
{
- int err;
- struct fd f;
- struct perf_event *event;
+ struct file *file;
- err = perf_fget_light(fd, &f);
- if (err)
- return ERR_PTR(err);
+ file = fget_raw(fd);
+ if (!file)
+ return ERR_PTR(-EBADF);
- event = f.file->private_data;
- atomic_long_inc(&event->refcount);
- fdput(f);
+ if (file->f_op != &perf_fops) {
+ fput(file);
+ return ERR_PTR(-EBADF);
+ }
- return event;
+ return file;
}
const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
@@ -8965,8 +8945,16 @@ inherit_event(struct perf_event *parent_event,
if (IS_ERR(child_event))
return child_event;
+ /*
+ * is_orphaned_event() and list_add_tail(&parent_event->child_list)
+ * must be under the same lock in order to serialize against
+ * perf_event_release_kernel(), such that either we must observe
+ * is_orphaned_event() or they will observe us on the child_list.
+ */
+ mutex_lock(&parent_event->child_mutex);
if (is_orphaned_event(parent_event) ||
!atomic_long_inc_not_zero(&parent_event->refcount)) {
+ mutex_unlock(&parent_event->child_mutex);
free_event(child_event);
return NULL;
}
@@ -9014,8 +9002,6 @@ inherit_event(struct perf_event *parent_event,
/*
* Link this into the parent event's child list
*/
- WARN_ON_ONCE(parent_event->ctx->parent_ctx);
- mutex_lock(&parent_event->child_mutex);
list_add_tail(&child_event->child_list, &parent_event->child_list);
mutex_unlock(&parent_event->child_mutex);
@@ -9239,7 +9225,7 @@ static void __perf_event_exit_context(void *__info)
raw_spin_lock(&ctx->lock);
list_for_each_entry(event, &ctx->event_list, event_entry)
- __perf_remove_from_context(event, cpuctx, ctx, (void *)(unsigned long)true);
+ __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
raw_spin_unlock(&ctx->lock);
}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 45dd798bcd37..326a75e884db 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -191,14 +191,17 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct perf_event *event;
+ struct file *file;
if (unlikely(index >= array->map.max_entries))
return -E2BIG;
- event = (struct perf_event *)array->ptrs[index];
- if (!event)
+ file = (struct file *)array->ptrs[index];
+ if (unlikely(!file))
return -ENOENT;
+ event = file->private_data;
+
/* make sure event is local and doesn't have pmu::count */
if (event->oncpu != smp_processor_id() ||
event->pmu->count)
@@ -228,6 +231,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
void *data = (void *) (long) r4;
struct perf_sample_data sample_data;
struct perf_event *event;
+ struct file *file;
struct perf_raw_record raw = {
.size = size,
.data = data,
@@ -236,10 +240,12 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
if (unlikely(index >= array->map.max_entries))
return -E2BIG;
- event = (struct perf_event *)array->ptrs[index];
- if (unlikely(!event))
+ file = (struct file *)array->ptrs[index];
+ if (unlikely(!file))
return -ENOENT;
+ event = file->private_data;
+
if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
return -EINVAL;