diff options
Diffstat (limited to 'kernel/events')
-rw-r--r-- | kernel/events/core.c | 156 | ||||
-rw-r--r-- | kernel/events/internal.h | 10 | ||||
-rw-r--r-- | kernel/events/ring_buffer.c | 104 |
3 files changed, 230 insertions, 40 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c index 2c78b6f47339..21ba024c9ed1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1926,8 +1926,13 @@ event_sched_in(struct perf_event *event, if (event->state <= PERF_EVENT_STATE_OFF) return 0; - event->state = PERF_EVENT_STATE_ACTIVE; - event->oncpu = smp_processor_id(); + WRITE_ONCE(event->oncpu, smp_processor_id()); + /* + * Order event::oncpu write to happen before the ACTIVE state + * is visible. + */ + smp_wmb(); + WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE); /* * Unthrottle events, since we scheduled we might have missed several @@ -2359,6 +2364,29 @@ void perf_event_enable(struct perf_event *event) } EXPORT_SYMBOL_GPL(perf_event_enable); +static int __perf_event_stop(void *info) +{ + struct perf_event *event = info; + + /* for AUX events, our job is done if the event is already inactive */ + if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) + return 0; + + /* matches smp_wmb() in event_sched_in() */ + smp_rmb(); + + /* + * There is a window with interrupts enabled before we get here, + * so we need to check again lest we try to stop another CPU's event. + */ + if (READ_ONCE(event->oncpu) != smp_processor_id()) + return -EAGAIN; + + event->pmu->stop(event, PERF_EF_UPDATE); + + return 0; +} + static int _perf_event_refresh(struct perf_event *event, int refresh) { /* @@ -4352,6 +4380,19 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon case PERF_EVENT_IOC_SET_BPF: return perf_event_set_bpf_prog(event, arg); + case PERF_EVENT_IOC_PAUSE_OUTPUT: { + struct ring_buffer *rb; + + rcu_read_lock(); + rb = rcu_dereference(event->rb); + if (!rb || !rb->nr_pages) { + rcu_read_unlock(); + return -EINVAL; + } + rb_toggle_paused(rb, !!arg); + rcu_read_unlock(); + return 0; + } default: return -ENOTTY; } @@ -4668,6 +4709,8 @@ static void perf_mmap_open(struct vm_area_struct *vma) event->pmu->event_mapped(event); } +static void perf_pmu_output_stop(struct perf_event *event); + /* * A buffer can be mmap()ed multiple times; either directly through the same * event, or through other events by use of perf_event_set_output(). @@ -4695,10 +4738,22 @@ static void perf_mmap_close(struct vm_area_struct *vma) */ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) { + /* + * Stop all AUX events that are writing to this buffer, + * so that we can free its AUX pages and corresponding PMU + * data. Note that after rb::aux_mmap_count dropped to zero, + * they won't start any more (see perf_aux_output_begin()). + */ + perf_pmu_output_stop(event); + + /* now it's safe to free the pages */ atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); vma->vm_mm->pinned_vm -= rb->aux_mmap_locked; + /* this has to be the last one */ rb_free_aux(rb); + WARN_ON_ONCE(atomic_read(&rb->aux_refcount)); + mutex_unlock(&event->mmap_mutex); } @@ -5769,6 +5824,80 @@ next: rcu_read_unlock(); } +struct remote_output { + struct ring_buffer *rb; + int err; +}; + +static void __perf_event_output_stop(struct perf_event *event, void *data) +{ + struct perf_event *parent = event->parent; + struct remote_output *ro = data; + struct ring_buffer *rb = ro->rb; + + if (!has_aux(event)) + return; + + if (!parent) + parent = event; + + /* + * In case of inheritance, it will be the parent that links to the + * ring-buffer, but it will be the child that's actually using it: + */ + if (rcu_dereference(parent->rb) == rb) + ro->err = __perf_event_stop(event); +} + +static int __perf_pmu_output_stop(void *info) +{ + struct perf_event *event = info; + struct pmu *pmu = event->pmu; + struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + struct remote_output ro = { + .rb = event->rb, + }; + + rcu_read_lock(); + perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro); + if (cpuctx->task_ctx) + perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop, + &ro); + rcu_read_unlock(); + + return ro.err; +} + +static void perf_pmu_output_stop(struct perf_event *event) +{ + struct perf_event *iter; + int err, cpu; + +restart: + rcu_read_lock(); + list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) { + /* + * For per-CPU events, we need to make sure that neither they + * nor their children are running; for cpu==-1 events it's + * sufficient to stop the event itself if it's active, since + * it can't have children. + */ + cpu = iter->cpu; + if (cpu == -1) + cpu = READ_ONCE(iter->oncpu); + + if (cpu == -1) + continue; + + err = cpu_function_call(cpu, __perf_pmu_output_stop, event); + if (err == -EAGAIN) { + rcu_read_unlock(); + goto restart; + } + } + rcu_read_unlock(); +} + /* * task tracking -- fork/exit * @@ -6500,10 +6629,7 @@ static int __perf_event_overflow(struct perf_event *event, irq_work_queue(&event->pending); } - if (event->overflow_handler) - event->overflow_handler(event, data, regs); - else - perf_event_output(event, data, regs); + event->overflow_handler(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; @@ -7694,6 +7820,15 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) } skip_type: + if (pmu->task_ctx_nr == perf_hw_context) { + static int hw_context_taken = 0; + + if (WARN_ON_ONCE(hw_context_taken)) + pmu->task_ctx_nr = perf_invalid_context; + + hw_context_taken = 1; + } + pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); if (pmu->pmu_cpu_context) goto got_cpu_context; @@ -8015,8 +8150,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, context = parent_event->overflow_handler_context; } - event->overflow_handler = overflow_handler; - event->overflow_handler_context = context; + if (overflow_handler) { + event->overflow_handler = overflow_handler; + event->overflow_handler_context = context; + } else { + event->overflow_handler = perf_event_output; + event->overflow_handler_context = NULL; + } perf_event__state_init(event); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 4199b6d193f5..05f9f6d626df 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -11,13 +11,13 @@ struct ring_buffer { atomic_t refcount; struct rcu_head rcu_head; - struct irq_work irq_work; #ifdef CONFIG_PERF_USE_VMALLOC struct work_struct work; int page_order; /* allocation order */ #endif int nr_pages; /* nr of data pages */ int overwrite; /* can overwrite itself */ + int paused; /* can write into ring buffer */ atomic_t poll; /* POLL_ for wakeups */ @@ -65,6 +65,14 @@ static inline void rb_free_rcu(struct rcu_head *rcu_head) rb_free(rb); } +static inline void rb_toggle_paused(struct ring_buffer *rb, bool pause) +{ + if (!pause && rb->nr_pages) + rb->paused = 0; + else + rb->paused = 1; +} + extern struct ring_buffer * rb_alloc(int nr_pages, long watermark, int cpu, int flags); extern void perf_event_wakeup(struct perf_event *event); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index c61f0cbd308b..60be55a64040 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -102,8 +102,21 @@ out: preempt_enable(); } -int perf_output_begin(struct perf_output_handle *handle, - struct perf_event *event, unsigned int size) +static bool __always_inline +ring_buffer_has_space(unsigned long head, unsigned long tail, + unsigned long data_size, unsigned int size, + bool backward) +{ + if (!backward) + return CIRC_SPACE(head, tail, data_size) >= size; + else + return CIRC_SPACE(tail, head, data_size) >= size; +} + +static int __always_inline +__perf_output_begin(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size, + bool backward) { struct ring_buffer *rb; unsigned long tail, offset, head; @@ -125,8 +138,11 @@ int perf_output_begin(struct perf_output_handle *handle, if (unlikely(!rb)) goto out; - if (unlikely(!rb->nr_pages)) + if (unlikely(rb->paused)) { + if (rb->nr_pages) + local_inc(&rb->lost); goto out; + } handle->rb = rb; handle->event = event; @@ -143,9 +159,12 @@ int perf_output_begin(struct perf_output_handle *handle, do { tail = READ_ONCE(rb->user_page->data_tail); offset = head = local_read(&rb->head); - if (!rb->overwrite && - unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) - goto fail; + if (!rb->overwrite) { + if (unlikely(!ring_buffer_has_space(head, tail, + perf_data_size(rb), + size, backward))) + goto fail; + } /* * The above forms a control dependency barrier separating the @@ -159,9 +178,17 @@ int perf_output_begin(struct perf_output_handle *handle, * See perf_output_put_handle(). */ - head += size; + if (!backward) + head += size; + else + head -= size; } while (local_cmpxchg(&rb->head, offset, head) != offset); + if (backward) { + offset = head; + head = (u64)(-head); + } + /* * We rely on the implied barrier() by local_cmpxchg() to ensure * none of the data stores below can be lifted up by the compiler. @@ -203,6 +230,12 @@ out: return -ENOSPC; } +int perf_output_begin(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size) +{ + return __perf_output_begin(handle, event, size, false); +} + unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len) { @@ -221,8 +254,6 @@ void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } -static void rb_irq_work(struct irq_work *work); - static void ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) { @@ -243,16 +274,13 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) INIT_LIST_HEAD(&rb->event_list); spin_lock_init(&rb->event_lock); - init_irq_work(&rb->irq_work, rb_irq_work); -} -static void ring_buffer_put_async(struct ring_buffer *rb) -{ - if (!atomic_dec_and_test(&rb->refcount)) - return; - - rb->rcu_head.next = (void *)rb; - irq_work_queue(&rb->irq_work); + /* + * perf_output_begin() only checks rb->paused, therefore + * rb->paused must be true if we have no pages for output. + */ + if (!rb->nr_pages) + rb->paused = 1; } /* @@ -264,6 +292,10 @@ static void ring_buffer_put_async(struct ring_buffer *rb) * The ordering is similar to that of perf_output_{begin,end}, with * the exception of (B), which should be taken care of by the pmu * driver, since ordering rules will differ depending on hardware. + * + * Call this from pmu::start(); see the comment in perf_aux_output_end() + * about its use in pmu callbacks. Both can also be called from the PMI + * handler if needed. */ void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event) @@ -288,6 +320,13 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, goto err; /* + * If rb::aux_mmap_count is zero (and rb_has_aux() above went through), + * the aux buffer is in perf_mmap_close(), about to get freed. + */ + if (!atomic_read(&rb->aux_mmap_count)) + goto err_put; + + /* * Nesting is not supported for AUX area, make sure nested * writers are caught early */ @@ -328,10 +367,11 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, return handle->rb->aux_priv; err_put: + /* can't be last */ rb_free_aux(rb); err: - ring_buffer_put_async(rb); + ring_buffer_put(rb); handle->event = NULL; return NULL; @@ -342,6 +382,10 @@ err: * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the * pmu driver's responsibility to observe ordering rules of the hardware, * so that all the data is externally visible before this is called. + * + * Note: this has to be called from pmu::stop() callback, as the assumption + * of the AUX buffer management code is that after pmu::stop(), the AUX + * transaction must be stopped and therefore drop the AUX reference count. */ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, bool truncated) @@ -381,8 +425,9 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, handle->event = NULL; local_set(&rb->aux_nest, 0); + /* can't be last */ rb_free_aux(rb); - ring_buffer_put_async(rb); + ring_buffer_put(rb); } /* @@ -463,6 +508,14 @@ static void __rb_free_aux(struct ring_buffer *rb) { int pg; + /* + * Should never happen, the last reference should be dropped from + * perf_mmap_close() path, which first stops aux transactions (which + * in turn are the atomic holders of aux_refcount) and then does the + * last rb_free_aux(). + */ + WARN_ON_ONCE(in_atomic()); + if (rb->aux_priv) { rb->free_aux(rb->aux_priv); rb->free_aux = NULL; @@ -574,18 +627,7 @@ out: void rb_free_aux(struct ring_buffer *rb) { if (atomic_dec_and_test(&rb->aux_refcount)) - irq_work_queue(&rb->irq_work); -} - -static void rb_irq_work(struct irq_work *work) -{ - struct ring_buffer *rb = container_of(work, struct ring_buffer, irq_work); - - if (!atomic_read(&rb->aux_refcount)) __rb_free_aux(rb); - - if (rb->rcu_head.next == (void *)rb) - call_rcu(&rb->rcu_head, rb_free_rcu); } #ifndef CONFIG_PERF_USE_VMALLOC |