From 1b038c6e05ff70a1e66e3e571c2e6106bdb75f53 Mon Sep 17 00:00:00 2001 From: Yabin Cui Date: Fri, 17 May 2019 13:52:31 +0200 Subject: perf/ring_buffer: Fix exposing a temporarily decreased data_head In perf_output_put_handle(), an IRQ/NMI can happen in below location and write records to the same ring buffer: ... local_dec_and_test(&rb->nest) ... <-- an IRQ/NMI can happen here rb->user_page->data_head = head; ... In this case, a value A is written to data_head in the IRQ, then a value B is written to data_head after the IRQ. And A > B. As a result, data_head is temporarily decreased from A to B. And a reader may see data_head < data_tail if it read the buffer frequently enough, which creates unexpected behaviors. This can be fixed by moving dec(&rb->nest) to after updating data_head, which prevents the IRQ/NMI above from updating data_head. [ Split up by peterz. ] Signed-off-by: Yabin Cui Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: mark.rutland@arm.com Fixes: ef60777c9abd ("perf: Optimize the perf_output() path by removing IRQ-disables") Link: http://lkml.kernel.org/r/20190517115418.224478157@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 674b35383491..009467a60578 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -51,11 +51,18 @@ again: head = local_read(&rb->head); /* - * IRQ/NMI can happen here, which means we can miss a head update. + * IRQ/NMI can happen here and advance @rb->head, causing our + * load above to be stale. */ - if (!local_dec_and_test(&rb->nest)) + /* + * If this isn't the outermost nesting, we don't have to update + * @rb->user_page->data_head. + */ + if (local_read(&rb->nest) > 1) { + local_dec(&rb->nest); goto out; + } /* * Since the mmap() consumer (userspace) can run on a different CPU: @@ -87,9 +94,18 @@ again: rb->user_page->data_head = head; /* - * Now check if we missed an update -- rely on previous implied - * compiler barriers to force a re-read. + * We must publish the head before decrementing the nest count, + * otherwise an IRQ/NMI can publish a more recent head value and our + * write will (temporarily) publish a stale value. + */ + barrier(); + local_set(&rb->nest, 0); + + /* + * Ensure we decrement @rb->nest before we validate the @rb->head. + * Otherwise we cannot be sure we caught the 'last' nested update. */ + barrier(); if (unlikely(head != local_read(&rb->head))) { local_inc(&rb->nest); goto again; -- cgit v1.2.3 From 3f9fbe9bd86c534eba2faf5d840fd44c6049f50e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 May 2019 13:52:32 +0200 Subject: perf/ring_buffer: Add ordering to rb->nest increment Similar to how decrementing rb->next too early can cause data_head to (temporarily) be observed to go backward, so too can this happen when we increment too late. This barrier() ensures the rb->head load happens after the increment, both the one in the 'goto again' path, as the one from perf_output_get_handle() -- albeit very unlikely to matter for the latter. Suggested-by: Yabin Cui Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: mark.rutland@arm.com Cc: namhyung@kernel.org Fixes: ef60777c9abd ("perf: Optimize the perf_output() path by removing IRQ-disables") Link: http://lkml.kernel.org/r/20190517115418.309516009@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 009467a60578..4b5f8d932400 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -48,6 +48,15 @@ static void perf_output_put_handle(struct perf_output_handle *handle) unsigned long head; again: + /* + * In order to avoid publishing a head value that goes backwards, + * we must ensure the load of @rb->head happens after we've + * incremented @rb->nest. + * + * Otherwise we can observe a @rb->head value before one published + * by an IRQ/NMI happening between the load and the increment. + */ + barrier(); head = local_read(&rb->head); /* -- cgit v1.2.3 From 4d839dd9e4356bbacf3eb0ab13a549b83b008c21 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 May 2019 13:52:33 +0200 Subject: perf/ring-buffer: Always use {READ,WRITE}_ONCE() for rb->user_page data We must use {READ,WRITE}_ONCE() on rb->user_page data such that concurrent usage will see whole values. A few key sites were missing this. Suggested-by: Yabin Cui Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: mark.rutland@arm.com Cc: namhyung@kernel.org Fixes: 7b732a750477 ("perf_counter: new output ABI - part 1") Link: http://lkml.kernel.org/r/20190517115418.394192145@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 4b5f8d932400..7a0c73e4b3eb 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -100,7 +100,7 @@ again: * See perf_output_begin(). */ smp_wmb(); /* B, matches C */ - rb->user_page->data_head = head; + WRITE_ONCE(rb->user_page->data_head, head); /* * We must publish the head before decrementing the nest count, @@ -496,7 +496,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) perf_event_aux_event(handle->event, aux_head, size, handle->aux_flags); - rb->user_page->aux_head = rb->aux_head; + WRITE_ONCE(rb->user_page->aux_head, rb->aux_head); if (rb_need_aux_wakeup(rb)) wakeup = true; @@ -528,7 +528,7 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) rb->aux_head += size; - rb->user_page->aux_head = rb->aux_head; + WRITE_ONCE(rb->user_page->aux_head, rb->aux_head); if (rb_need_aux_wakeup(rb)) { perf_output_wakeup(handle); handle->wakeup = rb->aux_wakeup + rb->aux_watermark; -- cgit v1.2.3 From 5322ea58a06da2e69c5ef36a9b4d4b9255edd423 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 May 2019 13:52:34 +0200 Subject: perf/ring-buffer: Use regular variables for nesting While the IRQ/NMI will nest, the nest-count will be invariant over the actual exception, since it will decrement equal to increment. This means we can -- carefully -- use a regular variable since the typical LOAD-STORE race doesn't exist (similar to preempt_count). This optimizes the ring-buffer for all LOAD-STORE architectures, since they need to use atomic ops to implement local_t. Suggested-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: mark.rutland@arm.com Cc: namhyung@kernel.org Cc: yabinc@google.com Link: http://lkml.kernel.org/r/20190517115418.481392777@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/internal.h | 4 ++-- kernel/events/ring_buffer.c | 41 ++++++++++++++++++++++++++--------------- 2 files changed, 28 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 79c47076700a..3aef4191798c 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -24,7 +24,7 @@ struct ring_buffer { atomic_t poll; /* POLL_ for wakeups */ local_t head; /* write position */ - local_t nest; /* nested writers */ + unsigned int nest; /* nested writers */ local_t events; /* event limit */ local_t wakeup; /* wakeup stamp */ local_t lost; /* nr records lost */ @@ -41,7 +41,7 @@ struct ring_buffer { /* AUX area */ long aux_head; - local_t aux_nest; + unsigned int aux_nest; long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */ unsigned long aux_pgoff; int aux_nr_pages; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 7a0c73e4b3eb..ffb59a4ef4ff 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -38,7 +38,12 @@ static void perf_output_get_handle(struct perf_output_handle *handle) struct ring_buffer *rb = handle->rb; preempt_disable(); - local_inc(&rb->nest); + + /* + * Avoid an explicit LOAD/STORE such that architectures with memops + * can use them. + */ + (*(volatile unsigned int *)&rb->nest)++; handle->wakeup = local_read(&rb->wakeup); } @@ -46,6 +51,17 @@ static void perf_output_put_handle(struct perf_output_handle *handle) { struct ring_buffer *rb = handle->rb; unsigned long head; + unsigned int nest; + + /* + * If this isn't the outermost nesting, we don't have to update + * @rb->user_page->data_head. + */ + nest = READ_ONCE(rb->nest); + if (nest > 1) { + WRITE_ONCE(rb->nest, nest - 1); + goto out; + } again: /* @@ -64,15 +80,6 @@ again: * load above to be stale. */ - /* - * If this isn't the outermost nesting, we don't have to update - * @rb->user_page->data_head. - */ - if (local_read(&rb->nest) > 1) { - local_dec(&rb->nest); - goto out; - } - /* * Since the mmap() consumer (userspace) can run on a different CPU: * @@ -108,7 +115,7 @@ again: * write will (temporarily) publish a stale value. */ barrier(); - local_set(&rb->nest, 0); + WRITE_ONCE(rb->nest, 0); /* * Ensure we decrement @rb->nest before we validate the @rb->head. @@ -116,7 +123,7 @@ again: */ barrier(); if (unlikely(head != local_read(&rb->head))) { - local_inc(&rb->nest); + WRITE_ONCE(rb->nest, 1); goto again; } @@ -355,6 +362,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *output_event = event; unsigned long aux_head, aux_tail; struct ring_buffer *rb; + unsigned int nest; if (output_event->parent) output_event = output_event->parent; @@ -385,13 +393,16 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, if (!refcount_inc_not_zero(&rb->aux_refcount)) goto err; + nest = READ_ONCE(rb->aux_nest); /* * Nesting is not supported for AUX area, make sure nested * writers are caught early */ - if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1))) + if (WARN_ON_ONCE(nest)) goto err_put; + WRITE_ONCE(rb->aux_nest, nest + 1); + aux_head = rb->aux_head; handle->rb = rb; @@ -419,7 +430,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, if (!handle->size) { /* A, matches D */ event->pending_disable = smp_processor_id(); perf_output_wakeup(handle); - local_set(&rb->aux_nest, 0); + WRITE_ONCE(rb->aux_nest, 0); goto err_put; } } @@ -508,7 +519,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) handle->event = NULL; - local_set(&rb->aux_nest, 0); + WRITE_ONCE(rb->aux_nest, 0); /* can't be last */ rb_free_aux(rb); ring_buffer_put(rb); -- cgit v1.2.3