From 0f4bd46ec252887f44f1f065b41867cac8f70dfb Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 22 Dec 2009 03:15:43 +0000 Subject: kmsg_dump: Dump on crash_kexec as well crash_kexec gets called before kmsg_dump(KMSG_DUMP_OOPS) if panic_on_oops is set, so the kernel log buffer is not stored for this case. This patch adds a KMSG_DUMP_KEXEC dump type which gets called when crash_kexec() is invoked. To avoid getting double dumps, the old KMSG_DUMP_PANIC is moved below crash_kexec(). The mtdoops driver is modified to handle KMSG_DUMP_KEXEC in the same way as a panic. Signed-off-by: KOSAKI Motohiro Acked-by: Simon Kagstrom Signed-off-by: David Woodhouse --- kernel/kexec.c | 4 ++++ kernel/panic.c | 3 ++- kernel/printk.c | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 433e9fcc1fc5..ae217488fef8 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -1074,6 +1075,9 @@ void crash_kexec(struct pt_regs *regs) if (mutex_trylock(&kexec_mutex)) { if (kexec_crash_image) { struct pt_regs fixed_regs; + + kmsg_dump(KMSG_DUMP_KEXEC); + crash_setup_regs(&fixed_regs, regs); crash_save_vmcoreinfo(); machine_crash_shutdown(&fixed_regs); diff --git a/kernel/panic.c b/kernel/panic.c index 5827f7b97254..c787333282b8 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -75,7 +75,6 @@ NORET_TYPE void panic(const char * fmt, ...) dump_stack(); #endif - kmsg_dump(KMSG_DUMP_PANIC); /* * If we have crashed and we have a crash kernel loaded let it handle * everything else. @@ -83,6 +82,8 @@ NORET_TYPE void panic(const char * fmt, ...) */ crash_kexec(NULL); + kmsg_dump(KMSG_DUMP_PANIC); + /* * Note smp_send_stop is the usual smp shutdown function, which * unfortunately means it may not be hardened to work in a panic diff --git a/kernel/printk.c b/kernel/printk.c index 1ded8e7dd19b..2c9dc0b03a5e 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1467,6 +1467,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister); static const char const *kmsg_reasons[] = { [KMSG_DUMP_OOPS] = "oops", [KMSG_DUMP_PANIC] = "panic", + [KMSG_DUMP_KEXEC] = "kexec", }; static const char *kmsg_to_str(enum kmsg_dump_reason reason) -- cgit v1.2.3 From 5ded3dc6a3c7549b36a8ac27bbd81b33756a2c29 Mon Sep 17 00:00:00 2001 From: David Sharp Date: Wed, 6 Jan 2010 17:12:07 -0800 Subject: ring-buffer: Wrap a list.next reference with rb_list_head() This reference at the end of rb_get_reader_page() was causing off-by-one writes to the prev pointer of the page after the reader page when that page is the head page, and therefore the reader page has the RB_PAGE_HEAD flag in its list.next pointer. This eventually results in a GPF in a subsequent call to rb_set_head_page() (usually from rb_get_reader_page()) when that prev pointer is dereferenced. The dereferenced register would characteristically have an address that appears shifted left by one byte (eg, ffxxxxxxxxxxxxyy instead of ffffxxxxxxxxxxxx) due to being written at an address one byte too high. Signed-off-by: David Sharp LKML-Reference: <1262826727-9090-1-git-send-email-dhsharp@google.com> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2326b04c95c4..d5b7308b7e1b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2906,7 +2906,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) * * Now make the new head point back to the reader page. */ - reader->list.next->prev = &cpu_buffer->reader_page->list; + rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list; rb_inc_page(cpu_buffer, &cpu_buffer->head_page); /* Finally update the reader page to the new head */ -- cgit v1.2.3 From 0e1ff5d72a6393f2ef5dbf74f58bb55a12d63834 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 Jan 2010 20:40:44 -0500 Subject: ring-buffer: Add rb_list_head() wrapper around new reader page next field If the very unlikely case happens where the writer moves the head by one between where the head page is read and where the new reader page is assigned _and_ the writer then writes and wraps the entire ring buffer so that the head page is back to what was originally read as the head page, the page to be swapped will have a corrupted next pointer. Simple solution is to wrap the assignment of the next pointer with a rb_list_head(). Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d5b7308b7e1b..edefe3b2801b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2869,7 +2869,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) * Splice the empty reader page into the list around the head. */ reader = rb_set_head_page(cpu_buffer); - cpu_buffer->reader_page->list.next = reader->list.next; + cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); cpu_buffer->reader_page->list.prev = reader->list.prev; /* -- cgit v1.2.3 From 7485d0d3758e8e6491a5c9468114e74dc050785d Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 5 Jan 2010 16:32:43 +0900 Subject: futexes: Remove rw parameter from get_futex_key() Currently, futexes have two problem: A) The current futex code doesn't handle private file mappings properly. get_futex_key() uses PageAnon() to distinguish file and anon, which can cause the following bad scenario: 1) thread-A call futex(private-mapping, FUTEX_WAIT), it sleeps on file mapping object. 2) thread-B writes a variable and it makes it cow. 3) thread-B calls futex(private-mapping, FUTEX_WAKE), it wakes up blocked thread on the anonymous page. (but it's nothing) B) Current futex code doesn't handle zero page properly. Read mode get_user_pages() can return zero page, but current futex code doesn't handle it at all. Then, zero page makes infinite loop internally. The solution is to use write mode get_user_page() always for page lookup. It prevents the lookup of both file page of private mappings and zero page. Performance concerns: Probaly very little, because glibc always initialize variables for futex before to call futex(). It means glibc users never see the overhead of this patch. Compatibility concerns: This patch has few compatibility issues. After this patch, FUTEX_WAIT require writable access to futex variables (read-only mappings makes EFAULT). But practically it's not a problem, glibc always initalizes variables for futexes explicitly - nobody uses read-only mappings. Reported-by: Hugh Dickins Signed-off-by: KOSAKI Motohiro Acked-by: Peter Zijlstra Acked-by: Darren Hart Cc: Cc: Linus Torvalds Cc: KAMEZAWA Hiroyuki Cc: Nick Piggin Cc: Ulrich Drepper LKML-Reference: <20100105162633.45A2.A69D9226@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/futex.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 8e3c3ffe1b9a..d9b3a2228f9d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -203,8 +203,6 @@ static void drop_futex_key_refs(union futex_key *key) * @uaddr: virtual address of the futex * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED * @key: address where result is stored. - * @rw: mapping needs to be read/write (values: VERIFY_READ, - * VERIFY_WRITE) * * Returns a negative error code or 0 * The key words are stored in *key on success. @@ -216,7 +214,7 @@ static void drop_futex_key_refs(union futex_key *key) * lock_page() might sleep, the caller should not hold a spinlock. */ static int -get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) +get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; @@ -239,7 +237,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) * but access_ok() should be faster than find_vma() */ if (!fshared) { - if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) + if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) return -EFAULT; key->private.mm = mm; key->private.address = address; @@ -248,7 +246,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) } again: - err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page); + err = get_user_pages_fast(address, 1, 1, &page); if (err < 0) return err; @@ -867,7 +865,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) if (!bitset) return -EINVAL; - ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ); + ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) goto out; @@ -913,10 +911,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, int ret, op_ret; retry: - ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); + ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); + ret = get_futex_key(uaddr2, fshared, &key2); if (unlikely(ret != 0)) goto out_put_key1; @@ -1175,11 +1173,10 @@ retry: pi_state = NULL; } - ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); + ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, fshared, &key2, - requeue_pi ? VERIFY_WRITE : VERIFY_READ); + ret = get_futex_key(uaddr2, fshared, &key2); if (unlikely(ret != 0)) goto out_put_key1; @@ -1738,7 +1735,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, */ retry: q->key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ); + ret = get_futex_key(uaddr, fshared, &q->key); if (unlikely(ret != 0)) return ret; @@ -1904,7 +1901,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, q.requeue_pi_key = NULL; retry: q.key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); + ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) goto out; @@ -2023,7 +2020,7 @@ retry: if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) return -EPERM; - ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE); + ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) goto out; @@ -2215,7 +2212,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, rt_waiter.task = NULL; key2 = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); + ret = get_futex_key(uaddr2, fshared, &key2); if (unlikely(ret != 0)) goto out; -- cgit v1.2.3 From 751e9983ee276cb150e8812b1d995f6035a63878 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 14 Jan 2010 10:53:02 +0800 Subject: ftrace: Fix MATCH_END_ONLY function filter For '*foo' pattern, we should allow any string ending with 'foo', but ftrace filter incorrectly disallows strings like bar_foo_foo: # echo '*io' > set_ftrace_filter # cat set_ftrace_filter | grep 'req_bio_endio' # cat available_filter_functions | grep 'req_bio_endio' req_bio_endio Signed-off-by: Li Zefan LKML-Reference: <4B4E870E.6060607@cn.fujitsu.com> Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7968762c8167..1e6640f80454 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1690,7 +1690,7 @@ ftrace_regex_lseek(struct file *file, loff_t offset, int origin) static int ftrace_match(char *str, char *regex, int len, int type) { int matched = 0; - char *ptr; + int slen; switch (type) { case MATCH_FULL: @@ -1706,8 +1706,8 @@ static int ftrace_match(char *str, char *regex, int len, int type) matched = 1; break; case MATCH_END_ONLY: - ptr = strstr(str, regex); - if (ptr && (ptr[len] == 0)) + slen = strlen(str); + if (slen >= len && memcmp(str + slen - len, regex, len) == 0) matched = 1; break; } -- cgit v1.2.3 From 285caad415f459f336247932b4db95a571357a02 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 14 Jan 2010 10:53:21 +0800 Subject: tracing/filters: Fix MATCH_FRONT_ONLY filter matching MATCH_FRONT_ONLY actually is a full matching: # ./perf record -R -f -a -e lock:lock_acquire \ --filter 'name ~rcu_*' sleep 1 # ./perf trace (no output) We should pass the length of the pattern string to strncmp(). Signed-off-by: Li Zefan LKML-Reference: <4B4E8721.5090301@cn.fujitsu.com> Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 50504cb228de..11c3973e6552 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -261,7 +261,7 @@ static int regex_match_full(char *str, struct regex *r, int len) static int regex_match_front(char *str, struct regex *r, int len) { - if (strncmp(str, r->pattern, len) == 0) + if (strncmp(str, r->pattern, r->len) == 0) return 1; return 0; } -- cgit v1.2.3 From a3291c14ecf0a995e30d993b7f2cae031de98727 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 14 Jan 2010 10:53:41 +0800 Subject: tracing/filters: Fix MATCH_END_ONLY filter matching For '*foo' pattern, we should allow any string ending with 'foo', but event filtering incorrectly disallows strings like bar_foo_foo: Signed-off-by: Li Zefan LKML-Reference: <4B4E8735.6070604@cn.fujitsu.com> Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 11c3973e6552..49e44dd17851 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -275,9 +275,10 @@ static int regex_match_middle(char *str, struct regex *r, int len) static int regex_match_end(char *str, struct regex *r, int len) { - char *ptr = strstr(str, r->pattern); + int strlen = len - 1; - if (ptr && (ptr[r->len] == 0)) + if (strlen >= r->len && + memcmp(str + strlen - r->len, r->pattern, r->len) == 0) return 1; return 0; } -- cgit v1.2.3 From b2af211f284eb1bef19fbb85fc8ef551bb1e7460 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 14 Jan 2010 10:54:11 +0800 Subject: tracing/filters: Fix MATCH_MIDDLE_ONLY filter matching The @str might not be NULL-terminated if it's of type DYN_STRING or STATIC_STRING, so we should use strnstr() instead of strstr(). Signed-off-by: Li Zefan LKML-Reference: <4B4E8753.2000102@cn.fujitsu.com> Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 49e44dd17851..f364b085397e 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -268,7 +268,7 @@ static int regex_match_front(char *str, struct regex *r, int len) static int regex_match_middle(char *str, struct regex *r, int len) { - if (strstr(str, r->pattern)) + if (strnstr(str, r->pattern, len)) return 1; return 0; } -- cgit v1.2.3 From 16da27a8bc7a0d050686d1b2e9efb53fab9ed226 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 14 Jan 2010 10:54:27 +0800 Subject: tracing/filters: Fix MATCH_FULL filter matching for PTR_STRING MATCH_FULL matching for PTR_STRING is not working correctly: # echo 'func == vt' > events/bkl/lock_kernel/filter # echo 1 > events/bkl/lock_kernel/enable ... # cat trace Xorg-1484 [000] 1973.392586: lock_kernel: ... func=vt_ioctl() gpm-1402 [001] 1974.027740: lock_kernel: ... func=vt_ioctl() We should pass to regex.match(..., len) the length (including '\0') of the source string instead of the length of the pattern string. Signed-off-by: Li Zefan LKML-Reference: <4B4E8763.5070707@cn.fujitsu.com> Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f364b085397e..60c2a4efad4a 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -211,8 +211,9 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event, { char **addr = (char **)(event + pred->offset); int cmp, match; + int len = strlen(*addr) + 1; /* including tailing '\0' */ - cmp = pred->regex.match(*addr, &pred->regex, pred->regex.field_len); + cmp = pred->regex.match(*addr, &pred->regex, len); match = cmp ^ pred->not; @@ -782,10 +783,8 @@ static int filter_add_pred(struct filter_parse_state *ps, pred->regex.field_len = field->size; } else if (field->filter_type == FILTER_DYN_STRING) fn = filter_pred_strloc; - else { + else fn = filter_pred_pchar; - pred->regex.field_len = strlen(pred->regex.pattern); - } } else { if (field->is_signed) ret = strict_strtoll(pred->regex.pattern, 0, &val); -- cgit v1.2.3 From d1303dd1d6b220cab375f24fa91a5640e54e169e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 14 Jan 2010 10:54:40 +0800 Subject: tracing/filters: Add comment for match callbacks We should be clear on 2 things: - the length parameter of a match callback includes tailing '\0'. - the string to be searched might not be NULL-terminated. Signed-off-by: Li Zefan LKML-Reference: <4B4E8770.7000608@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 60c2a4efad4a..e42af9aad69f 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -252,7 +252,18 @@ static int filter_pred_none(struct filter_pred *pred, void *event, return 0; } -/* Basic regex callbacks */ +/* + * regex_match_foo - Basic regex callbacks + * + * @str: the string to be searched + * @r: the regex structure containing the pattern string + * @len: the length of the string to be searched (including '\0') + * + * Note: + * - @str might not be NULL-terminated if it's of type DYN_STRING + * or STATIC_STRING + */ + static int regex_match_full(char *str, struct regex *r, int len) { if (strncmp(str, r->pattern, len) == 0) -- cgit v1.2.3 From 8ecc2951534af10e04ddb5e5ff5c6d217b79f5c2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 15 Jan 2010 17:01:12 -0800 Subject: kfifo: use void * pointers for user buffers The pointers to user buffers are currently unsigned char *, which requires a lot of casting in the caller for any non-char typed buffers. Use void * instead. Signed-off-by: Andi Kleen Acked-by: Stefani Seibold Cc: Roland Dreier Cc: Dmitry Torokhov Cc: Andy Walls Cc: Vikram Dhillon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kfifo.h | 10 +++++----- kernel/kfifo.c | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index c4ac88b3c302..6fb495ea956a 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -104,15 +104,15 @@ union { \ #undef __kfifo_initializer -extern void kfifo_init(struct kfifo *fifo, unsigned char *buffer, +extern void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size); extern __must_check int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask); extern void kfifo_free(struct kfifo *fifo); extern unsigned int kfifo_in(struct kfifo *fifo, - const unsigned char *from, unsigned int len); + const void *from, unsigned int len); extern __must_check unsigned int kfifo_out(struct kfifo *fifo, - unsigned char *to, unsigned int len); + void *to, unsigned int len); /** * kfifo_reset - removes the entire FIFO contents @@ -194,7 +194,7 @@ static inline __must_check unsigned int kfifo_avail(struct kfifo *fifo) * bytes copied. */ static inline unsigned int kfifo_in_locked(struct kfifo *fifo, - const unsigned char *from, unsigned int n, spinlock_t *lock) + const void *from, unsigned int n, spinlock_t *lock) { unsigned long flags; unsigned int ret; @@ -219,7 +219,7 @@ static inline unsigned int kfifo_in_locked(struct kfifo *fifo, * @to buffer and returns the number of copied bytes. */ static inline __must_check unsigned int kfifo_out_locked(struct kfifo *fifo, - unsigned char *to, unsigned int n, spinlock_t *lock) + void *to, unsigned int n, spinlock_t *lock) { unsigned long flags; unsigned int ret; diff --git a/kernel/kfifo.c b/kernel/kfifo.c index e92d519f93b1..ab615e695052 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -28,7 +28,7 @@ #include #include -static void _kfifo_init(struct kfifo *fifo, unsigned char *buffer, +static void _kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size) { fifo->buffer = buffer; @@ -44,7 +44,7 @@ static void _kfifo_init(struct kfifo *fifo, unsigned char *buffer, * @size: the size of the internal buffer, this have to be a power of 2. * */ -void kfifo_init(struct kfifo *fifo, unsigned char *buffer, unsigned int size) +void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size) { /* size must be a power of 2 */ BUG_ON(!is_power_of_2(size)); @@ -235,7 +235,7 @@ EXPORT_SYMBOL(__kfifo_in_n); * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. */ -unsigned int kfifo_in(struct kfifo *fifo, const unsigned char *from, +unsigned int kfifo_in(struct kfifo *fifo, const void *from, unsigned int len) { len = min(kfifo_avail(fifo), len); @@ -277,7 +277,7 @@ EXPORT_SYMBOL(__kfifo_out_n); * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. */ -unsigned int kfifo_out(struct kfifo *fifo, unsigned char *to, unsigned int len) +unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len) { len = min(kfifo_len(fifo), len); -- cgit v1.2.3 From 64ce1037c5434b1d036cd99ecaee6e00496bc2e9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 15 Jan 2010 17:01:15 -0800 Subject: kfifo: sanitize *_user error handling Right now for kfifo_*_user it's not easily possible to distingush between a user copy failing and the FIFO not containing enough data. The problem is that both conditions are multiplexed into the same return code. Avoid this by moving the "copy length" into a separate output parameter and only return 0/-EFAULT in the main return value. I didn't fully adapt the weird "record" variants, those seem to be unused anyways and were rather messy (should they be just removed?) I would appreciate some double checking if I did all the conversions correctly. Signed-off-by: Andi Kleen Cc: Stefani Seibold Cc: Roland Dreier Cc: Dmitry Torokhov Cc: Andy Walls Cc: Vikram Dhillon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kfifo.h | 8 +++--- kernel/kfifo.c | 76 +++++++++++++++++++++++++++++++++------------------ 2 files changed, 53 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index 6fb495ea956a..86ad50a900c8 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -235,11 +235,11 @@ static inline __must_check unsigned int kfifo_out_locked(struct kfifo *fifo, extern void kfifo_skip(struct kfifo *fifo, unsigned int len); -extern __must_check unsigned int kfifo_from_user(struct kfifo *fifo, - const void __user *from, unsigned int n); +extern __must_check int kfifo_from_user(struct kfifo *fifo, + const void __user *from, unsigned int n, unsigned *lenout); -extern __must_check unsigned int kfifo_to_user(struct kfifo *fifo, - void __user *to, unsigned int n); +extern __must_check int kfifo_to_user(struct kfifo *fifo, + void __user *to, unsigned int n, unsigned *lenout); /* * __kfifo_add_out internal helper function for updating the out offset diff --git a/kernel/kfifo.c b/kernel/kfifo.c index ab615e695052..b50bb622e8b0 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -159,8 +159,9 @@ static inline void __kfifo_out_data(struct kfifo *fifo, memcpy(to + l, fifo->buffer, len - l); } -static inline unsigned int __kfifo_from_user_data(struct kfifo *fifo, - const void __user *from, unsigned int len, unsigned int off) +static inline int __kfifo_from_user_data(struct kfifo *fifo, + const void __user *from, unsigned int len, unsigned int off, + unsigned *lenout) { unsigned int l; int ret; @@ -177,16 +178,20 @@ static inline unsigned int __kfifo_from_user_data(struct kfifo *fifo, /* first put the data starting from fifo->in to buffer end */ l = min(len, fifo->size - off); ret = copy_from_user(fifo->buffer + off, from, l); - - if (unlikely(ret)) - return ret + len - l; + if (unlikely(ret)) { + *lenout = ret; + return -EFAULT; + } + *lenout = l; /* then put the rest (if any) at the beginning of the buffer */ - return copy_from_user(fifo->buffer, from + l, len - l); + ret = copy_from_user(fifo->buffer, from + l, len - l); + *lenout += ret ? ret : len - l; + return ret ? -EFAULT : 0; } -static inline unsigned int __kfifo_to_user_data(struct kfifo *fifo, - void __user *to, unsigned int len, unsigned int off) +static inline int __kfifo_to_user_data(struct kfifo *fifo, + void __user *to, unsigned int len, unsigned int off, unsigned *lenout) { unsigned int l; int ret; @@ -203,12 +208,21 @@ static inline unsigned int __kfifo_to_user_data(struct kfifo *fifo, /* first get the data from fifo->out until the end of the buffer */ l = min(len, fifo->size - off); ret = copy_to_user(to, fifo->buffer + off, l); - - if (unlikely(ret)) - return ret + len - l; + *lenout = l; + if (unlikely(ret)) { + *lenout -= ret; + return -EFAULT; + } /* then get the rest (if any) from the beginning of the buffer */ - return copy_to_user(to + l, fifo->buffer, len - l); + len -= l; + ret = copy_to_user(to + l, fifo->buffer, len); + if (unlikely(ret)) { + *lenout += len - ret; + return -EFAULT; + } + *lenout += len; + return 0; } unsigned int __kfifo_in_n(struct kfifo *fifo, @@ -299,10 +313,13 @@ EXPORT_SYMBOL(__kfifo_out_generic); unsigned int __kfifo_from_user_n(struct kfifo *fifo, const void __user *from, unsigned int len, unsigned int recsize) { + unsigned total; + if (kfifo_avail(fifo) < len + recsize) return len + 1; - return __kfifo_from_user_data(fifo, from, len, recsize); + __kfifo_from_user_data(fifo, from, len, recsize, &total); + return total; } EXPORT_SYMBOL(__kfifo_from_user_n); @@ -313,18 +330,21 @@ EXPORT_SYMBOL(__kfifo_from_user_n); * @len: the length of the data to be added. * * This function copies at most @len bytes from the @from into the - * FIFO depending and returns the number of copied bytes. + * FIFO depending and returns -EFAULT/0. * * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. */ -unsigned int kfifo_from_user(struct kfifo *fifo, - const void __user *from, unsigned int len) +int kfifo_from_user(struct kfifo *fifo, + const void __user *from, unsigned int len, unsigned *total) { + int ret; len = min(kfifo_avail(fifo), len); - len -= __kfifo_from_user_data(fifo, from, len, 0); + ret = __kfifo_from_user_data(fifo, from, len, 0, total); + if (ret) + return ret; __kfifo_add_in(fifo, len); - return len; + return 0; } EXPORT_SYMBOL(kfifo_from_user); @@ -339,17 +359,17 @@ unsigned int __kfifo_to_user_n(struct kfifo *fifo, void __user *to, unsigned int len, unsigned int reclen, unsigned int recsize) { - unsigned int ret; + unsigned int ret, total; if (kfifo_len(fifo) < reclen + recsize) return len; - ret = __kfifo_to_user_data(fifo, to, reclen, recsize); + ret = __kfifo_to_user_data(fifo, to, reclen, recsize, &total); if (likely(ret == 0)) __kfifo_add_out(fifo, reclen + recsize); - return ret; + return total; } EXPORT_SYMBOL(__kfifo_to_user_n); @@ -358,20 +378,22 @@ EXPORT_SYMBOL(__kfifo_to_user_n); * @fifo: the fifo to be used. * @to: where the data must be copied. * @len: the size of the destination buffer. + @ @lenout: pointer to output variable with copied data * * This function copies at most @len bytes from the FIFO into the - * @to buffer and returns the number of copied bytes. + * @to buffer and 0 or -EFAULT. * * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. */ -unsigned int kfifo_to_user(struct kfifo *fifo, - void __user *to, unsigned int len) +int kfifo_to_user(struct kfifo *fifo, + void __user *to, unsigned int len, unsigned *lenout) { + int ret; len = min(kfifo_len(fifo), len); - len -= __kfifo_to_user_data(fifo, to, len, 0); - __kfifo_add_out(fifo, len); - return len; + ret = __kfifo_to_user_data(fifo, to, len, 0, lenout); + __kfifo_add_out(fifo, *lenout); + return ret; } EXPORT_SYMBOL(kfifo_to_user); -- cgit v1.2.3 From a5b9e2c1063046421ce01dcf5ddd7ec12567f3e1 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 15 Jan 2010 17:01:16 -0800 Subject: kfifo: add kfifo_out_peek In some upcoming code it's useful to peek into a FIFO without permanentely removing data. This patch implements a new kfifo_out_peek() to do this. Signed-off-by: Andi Kleen Acked-by: Stefani Seibold Cc: Roland Dreier Cc: Dmitry Torokhov Cc: Andy Walls Cc: Vikram Dhillon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kfifo.h | 3 +++ kernel/kfifo.c | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'kernel') diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index 86ad50a900c8..7ad6d32dd673 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -113,6 +113,9 @@ extern unsigned int kfifo_in(struct kfifo *fifo, const void *from, unsigned int len); extern __must_check unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len); +extern __must_check unsigned int kfifo_out_peek(struct kfifo *fifo, + void *to, unsigned int len, unsigned offset); + /** * kfifo_reset - removes the entire FIFO contents diff --git a/kernel/kfifo.c b/kernel/kfifo.c index b50bb622e8b0..7384f120be87 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -302,6 +302,27 @@ unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len) } EXPORT_SYMBOL(kfifo_out); +/** + * kfifo_out_peek - copy some data from the FIFO, but do not remove it + * @fifo: the fifo to be used. + * @to: where the data must be copied. + * @len: the size of the destination buffer. + * @offset: offset into the fifo + * + * This function copies at most @len bytes at @offset from the FIFO + * into the @to buffer and returns the number of copied bytes. + * The data is not removed from the FIFO. + */ +unsigned int kfifo_out_peek(struct kfifo *fifo, void *to, unsigned int len, + unsigned offset) +{ + len = min(kfifo_len(fifo), len + offset); + + __kfifo_out_data(fifo, to, len, offset); + return len; +} +EXPORT_SYMBOL(kfifo_out_peek); + unsigned int __kfifo_out_generic(struct kfifo *fifo, void *to, unsigned int len, unsigned int recsize, unsigned int *total) -- cgit v1.2.3 From 5dab600e6a153ceb64832f608069e6c08185411a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 15 Jan 2010 17:01:17 -0800 Subject: kfifo: document everywhere that size has to be power of two On my first try using them I missed that the fifos need to be power of two, resulting in a runtime bug. Document that requirement everywhere (and fix one grammar bug) Signed-off-by: Andi Kleen Acked-by: Stefani Seibold Cc: Roland Dreier Cc: Dmitry Torokhov Cc: Andy Walls Cc: Vikram Dhillon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kfifo.h | 4 ++-- kernel/kfifo.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index c8618243ca5a..6f6c5f300af6 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -67,7 +67,7 @@ struct kfifo { /** * DECLARE_KFIFO - macro to declare a kfifo and the associated buffer * @name: name of the declared kfifo datatype - * @size: size of the fifo buffer + * @size: size of the fifo buffer. Must be a power of two. * * Note1: the macro can be used inside struct or union declaration * Note2: the macro creates two objects: @@ -91,7 +91,7 @@ union { \ /** * DEFINE_KFIFO - macro to define and initialize a kfifo * @name: name of the declared kfifo datatype - * @size: size of the fifo buffer + * @size: size of the fifo buffer. Must be a power of two. * * Note1: the macro can be used for global and local kfifo data type variables * Note2: the macro creates two objects: diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 7384f120be87..32c5c15d750d 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -41,7 +41,7 @@ static void _kfifo_init(struct kfifo *fifo, void *buffer, * kfifo_init - initialize a FIFO using a preallocated buffer * @fifo: the fifo to assign the buffer * @buffer: the preallocated buffer to be used. - * @size: the size of the internal buffer, this have to be a power of 2. + * @size: the size of the internal buffer, this has to be a power of 2. * */ void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size) -- cgit v1.2.3 From af2422c42c0ff42b8b93dbb3a5fe65250fb65c40 Mon Sep 17 00:00:00 2001 From: David John Date: Fri, 15 Jan 2010 17:01:23 -0800 Subject: smp_call_function_any(): pass the node value to cpumask_of_node() The change in acpi_cpufreq to use smp_call_function_any causes a warning when it is called since the function erroneously passes the cpu id to cpumask_of_node rather than the node that the cpu is on. Fix this. cpumask_of_node(3): node > nr_node_ids(1) Pid: 1, comm: swapper Not tainted 2.6.33-rc3-00097-g2c1f189 #223 Call Trace: [] cpumask_of_node+0x23/0x58 [] smp_call_function_any+0x65/0xfa [] ? do_drv_read+0x0/0x2f [] get_cur_val+0xb0/0x102 [] get_cur_freq_on_cpu+0x74/0xc5 [] acpi_cpufreq_cpu_init+0x417/0x515 [] ? __down_write+0xb/0xd [] cpufreq_add_dev+0x278/0x922 Signed-off-by: David John Cc: Suresh Siddha Cc: Rusty Russell Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index de735a6637d0..f10408422444 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -347,7 +347,7 @@ int smp_call_function_any(const struct cpumask *mask, goto call; /* Try for same node. */ - nodemask = cpumask_of_node(cpu); + nodemask = cpumask_of_node(cpu_to_node(cpu)); for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids; cpu = cpumask_next_and(cpu, nodemask, mask)) { if (cpu_online(cpu)) -- cgit v1.2.3 From ea9d8e3f45404d411c00ae67b45cc35c58265bb7 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Thu, 7 Jan 2010 11:22:44 +0800 Subject: clockevent: Don't remove broadcast device when cpu is dead Marc reported that the BUG_ON in clockevents_notify() triggers on his system. This happens because the kernel tries to remove an active clock event device (used for broadcasting) from the device list. The handling of devices which can be used as per cpu device and as a global broadcast device is suboptimal. The simplest solution for now (and for stable) is to check whether the device is used as global broadcast device, but this needs to be revisited. [ tglx: restored the cpuweight check and massaged the changelog ] Reported-by: Marc Dionne Tested-by: Marc Dionne Signed-off-by: Xiaotian Feng LKML-Reference: <1262834564-13033-1-git-send-email-dfeng@redhat.com> Signed-off-by: Thomas Gleixner Cc: stable@kernel.org --- kernel/time/clockevents.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 6f740d9f0948..d7395fdfb9f3 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -259,7 +259,8 @@ void clockevents_notify(unsigned long reason, void *arg) cpu = *((int *)arg); list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { if (cpumask_test_cpu(cpu, dev->cpumask) && - cpumask_weight(dev->cpumask) == 1) { + cpumask_weight(dev->cpumask) == 1 && + !tick_is_broadcast_device(dev)) { BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); list_del(&dev->list); } -- cgit v1.2.3 From 50b926e439620c469565e8be0f28be78f5fca1ce Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 4 Jan 2010 14:44:56 +0100 Subject: sched: Fix vmark regression on big machines SD_PREFER_SIBLING is set at the CPU domain level if power saving isn't enabled, leading to many cache misses on large machines as we traverse looking for an idle shared cache to wake to. Change the enabler of select_idle_sibling() to SD_SHARE_PKG_RESOURCES, and enable same at the sibling domain level. Reported-by: Lin Ming Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1262612696.15495.15.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- include/linux/topology.h | 2 +- kernel/sched_fair.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/topology.h b/include/linux/topology.h index 57e63579bfdd..5b81156780b1 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -99,7 +99,7 @@ int arch_update_cpu_topology(void); | 1*SD_WAKE_AFFINE \ | 1*SD_SHARE_CPUPOWER \ | 0*SD_POWERSAVINGS_BALANCE \ - | 0*SD_SHARE_PKG_RESOURCES \ + | 1*SD_SHARE_PKG_RESOURCES \ | 0*SD_SERIALIZE \ | 0*SD_PREFER_SIBLING \ , \ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 42ac3c9f66f6..8fe7ee81c552 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1508,7 +1508,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag * If there's an idle sibling in this domain, make that * the wake_affine target instead of the current cpu. */ - if (tmp->flags & SD_PREFER_SIBLING) + if (tmp->flags & SD_SHARE_PKG_RESOURCES) target = select_idle_sibling(p, tmp, target); if (target >= 0) { -- cgit v1.2.3 From 6d558c3ac9b6508d26fd5cadccce51fc9d726b1c Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Mon, 11 Jan 2010 14:21:25 +0800 Subject: sched: Reassign prev and switch_count when reacquire_kernel_lock() fail Assume A->B schedule is processing, if B have acquired BKL before and it need reschedule this time. Then on B's context, it will go to need_resched_nonpreemptible for reschedule. But at this time, prev and switch_count are related to A. It's wrong and will lead to incorrect scheduler statistics. Signed-off-by: Yong Zhang Signed-off-by: Peter Zijlstra LKML-Reference: <2674af741001102238w7b0ddcadref00d345e2181d11@mail.gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c535cc4f6428..4508fe7048be 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5530,8 +5530,11 @@ need_resched_nonpreemptible: post_schedule(rq); - if (unlikely(reacquire_kernel_lock(current) < 0)) + if (unlikely(reacquire_kernel_lock(current) < 0)) { + prev = rq->curr; + switch_count = &prev->nivcsw; goto need_resched_nonpreemptible; + } preempt_enable_no_resched(); if (need_resched()) -- cgit v1.2.3 From fe432200abb0d64f409895168d9ad8fbb9d8e6c6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Jan 2010 09:08:26 +0100 Subject: perf: Fix perf_event_do_pending() fallback callsite Paul questioned the context in which we should call perf_event_do_pending(). After looking at that I found that it should be called from IRQ context these days, however the fallback call-site is placed in softirq context. Ammend this by placing the callback in the IRQ timer path. Reported-by: Paul Mackerras Signed-off-by: Peter Zijlstra LKML-Reference: <1263374859.4244.192.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/timer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 15533b792397..c61a7949387f 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1198,6 +1198,7 @@ void update_process_times(int user_tick) run_local_timers(); rcu_check_callbacks(cpu, user_tick); printk_tick(); + perf_event_do_pending(); scheduler_tick(); run_posix_cpu_timers(p); } @@ -1209,8 +1210,6 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = __get_cpu_var(tvec_bases); - perf_event_do_pending(); - hrtimer_run_pending(); if (time_after_eq(jiffies, base->timer_jiffies)) -- cgit v1.2.3 From 22e190851f8709c48baf00ed9ce6144cdc54d025 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Jan 2010 09:12:32 +0100 Subject: perf: Honour event state for aux stream data Anton reported that perf record kept receiving events even after calling ioctl(PERF_EVENT_IOC_DISABLE). It turns out that FORK,COMM and MMAP events didn't respect the disabled state and kept flowing in. Reported-by: Anton Blanchard Signed-off-by: Peter Zijlstra Tested-by: Anton Blanchard LKML-Reference: <1263459187.4244.265.camel@laptop> CC: stable@kernel.org Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 603c0d8b5df1..d27746bd3a06 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -3268,6 +3268,9 @@ static void perf_event_task_output(struct perf_event *event, static int perf_event_task_match(struct perf_event *event) { + if (event->state != PERF_EVENT_STATE_ACTIVE) + return 0; + if (event->cpu != -1 && event->cpu != smp_processor_id()) return 0; @@ -3377,6 +3380,9 @@ static void perf_event_comm_output(struct perf_event *event, static int perf_event_comm_match(struct perf_event *event) { + if (event->state != PERF_EVENT_STATE_ACTIVE) + return 0; + if (event->cpu != -1 && event->cpu != smp_processor_id()) return 0; @@ -3494,6 +3500,9 @@ static void perf_event_mmap_output(struct perf_event *event, static int perf_event_mmap_match(struct perf_event *event, struct perf_mmap_event *mmap_event) { + if (event->state != PERF_EVENT_STATE_ACTIVE) + return 0; + if (event->cpu != -1 && event->cpu != smp_processor_id()) return 0; -- cgit v1.2.3 From fabf318e5e4bda0aca2b0d617b191884fda62703 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 21 Jan 2010 21:04:57 +0100 Subject: sched: Fix fork vs hotplug vs cpuset namespaces There are a number of issues: 1) TASK_WAKING vs cgroup_clone (cpusets) copy_process(): sched_fork() child->state = TASK_WAKING; /* waiting for wake_up_new_task() */ if (current->nsproxy != p->nsproxy) ns_cgroup_clone() cgroup_clone() mutex_lock(inode->i_mutex) mutex_lock(cgroup_mutex) cgroup_attach_task() ss->can_attach() ss->attach() [ -> cpuset_attach() ] cpuset_attach_task() set_cpus_allowed_ptr(); while (child->state == TASK_WAKING) cpu_relax(); will deadlock the system. 2) cgroup_clone (cpusets) vs copy_process So even if the above would work we still have: copy_process(): if (current->nsproxy != p->nsproxy) ns_cgroup_clone() cgroup_clone() mutex_lock(inode->i_mutex) mutex_lock(cgroup_mutex) cgroup_attach_task() ss->can_attach() ss->attach() [ -> cpuset_attach() ] cpuset_attach_task() set_cpus_allowed_ptr(); ... p->cpus_allowed = current->cpus_allowed over-writing the modified cpus_allowed. 3) fork() vs hotplug if we unplug the child's cpu after the sanity check when the child gets attached to the task_list but before wake_up_new_task() shit will meet with fan. Solve all these issues by moving fork cpu selection into wake_up_new_task(). Reported-by: Serge E. Hallyn Tested-by: Serge E. Hallyn Signed-off-by: Peter Zijlstra LKML-Reference: <1264106190.4283.1314.camel@laptop> Signed-off-by: Thomas Gleixner --- kernel/fork.c | 15 --------------- kernel/sched.c | 39 +++++++++++++++++++++++++++------------ 2 files changed, 27 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 5b2959b3ffc2..f88bd984df35 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1241,21 +1241,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); - /* - * The task hasn't been attached yet, so its cpus_allowed mask will - * not be changed, nor will its assigned CPU. - * - * The cpus_allowed mask of the parent may have changed after it was - * copied first time - so re-copy it here, then check the child's CPU - * to ensure it is on a valid CPU (and if not, just force it back to - * parent's CPU). This avoids alot of nasty races. - */ - p->cpus_allowed = current->cpus_allowed; - p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed; - if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || - !cpu_online(task_cpu(p)))) - set_task_cpu(p, smp_processor_id()); - /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; diff --git a/kernel/sched.c b/kernel/sched.c index 4508fe7048be..3a8fb30a91b1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2320,14 +2320,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p) } /* - * Called from: + * Gets called from 3 sites (exec, fork, wakeup), since it is called without + * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done + * by: * - * - fork, @p is stable because it isn't on the tasklist yet - * - * - exec, @p is unstable, retry loop - * - * - wake-up, we serialize ->cpus_allowed against TASK_WAKING so - * we should be good. + * exec: is unstable, retry loop + * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING */ static inline int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) @@ -2620,9 +2618,6 @@ void sched_fork(struct task_struct *p, int clone_flags) if (p->sched_class->task_fork) p->sched_class->task_fork(p); -#ifdef CONFIG_SMP - cpu = select_task_rq(p, SD_BALANCE_FORK, 0); -#endif set_task_cpu(p, cpu); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) @@ -2652,6 +2647,21 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { unsigned long flags; struct rq *rq; + int cpu = get_cpu(); + +#ifdef CONFIG_SMP + /* + * Fork balancing, do it here and not earlier because: + * - cpus_allowed can change in the fork path + * - any previously selected cpu might disappear through hotplug + * + * We still have TASK_WAKING but PF_STARTING is gone now, meaning + * ->cpus_allowed is stable, we have preemption disabled, meaning + * cpu_online_mask is stable. + */ + cpu = select_task_rq(p, SD_BALANCE_FORK, 0); + set_task_cpu(p, cpu); +#endif rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_WAKING); @@ -2665,6 +2675,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_woken(rq, p); #endif task_rq_unlock(rq, &flags); + put_cpu(); } #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -7139,14 +7150,18 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) * the ->cpus_allowed mask from under waking tasks, which would be * possible when we change rq->lock in ttwu(), so synchronize against * TASK_WAKING to avoid that. + * + * Make an exception for freshly cloned tasks, since cpuset namespaces + * might move the task about, we have to validate the target in + * wake_up_new_task() anyway since the cpu might have gone away. */ again: - while (p->state == TASK_WAKING) + while (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) cpu_relax(); rq = task_rq_lock(p, &flags); - if (p->state == TASK_WAKING) { + if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) { task_rq_unlock(rq, &flags); goto again; } -- cgit v1.2.3 From 74bf4076f2ed79b5510440b72a561823a8852ec0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 25 Jan 2010 15:11:53 -0500 Subject: tracing: Prevent kernel oops with corrupted buffer If the contents of the ftrace ring buffer gets corrupted and the trace file is read, it could create a kernel oops (usualy just killing the user task thread). This is caused by the checking of the pid in the buffer. If the pid is negative, it still references the cmdline cache array, which could point to an invalid address. The simple fix is to test for negative PIDs. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0df1b0f2cb9e..eac6875cb990 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -951,6 +951,11 @@ void trace_find_cmdline(int pid, char comm[]) return; } + if (WARN_ON_ONCE(pid < 0)) { + strcpy(comm, ""); + return; + } + if (pid > PID_MAX_DEFAULT) { strcpy(comm, "<...>"); return; -- cgit v1.2.3 From 7b7422a566aa0dc1e582ce263d4c7ff4a772700a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 26 Jan 2010 12:51:10 +0100 Subject: clocksource: Prevent potential kgdb dead lock commit 0f8e8ef7 (clocksource: Simplify clocksource watchdog resume logic) introduced a potential kgdb dead lock. When the kernel is stopped by kgdb inside code which holds watchdog_lock then kgdb dead locks in clocksource_resume_watchdog(). clocksource_resume_watchdog() is called from kbdg via clocksource_touch_watchdog() to avoid that the clock source watchdog marks TSC unstable after the kernel has been stopped. Solve this by replacing spin_lock with a spin_trylock and just return in case the lock is held. Not resetting the watchdog might result in TSC becoming marked unstable, but that's an acceptable penalty for using kgdb. The timekeeping is anyway easily screwed up by kgdb when the system uses either jiffies or a clock source which wraps in short intervals (e.g. pm_timer wraps about every 4.6s), so we really do not have to worry about that occasional TSC marked unstable side effect. The second caller of clocksource_resume_watchdog() is clocksource_resume(). The trylock is safe here as well because the system is UP at this point, interrupts are disabled and nothing else can hold watchdog_lock(). Reported-by: Jason Wessel LKML-Reference: <1264480000-6997-4-git-send-email-jason.wessel@windriver.com> Cc: kgdb-bugreport@lists.sourceforge.net Cc: Martin Schwidefsky Cc: John Stultz Cc: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e85c23404d34..13700833c181 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -343,7 +343,19 @@ static void clocksource_resume_watchdog(void) { unsigned long flags; - spin_lock_irqsave(&watchdog_lock, flags); + /* + * We use trylock here to avoid a potential dead lock when + * kgdb calls this code after the kernel has been stopped with + * watchdog_lock held. When watchdog_lock is held we just + * return and accept, that the watchdog might trigger and mark + * the monitored clock source (usually TSC) unstable. + * + * This does not affect the other caller clocksource_resume() + * because at this point the kernel is UP, interrupts are + * disabled and nothing can hold watchdog_lock. + */ + if (!spin_trylock_irqsave(&watchdog_lock, flags)) + return; clocksource_reset_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); } @@ -458,8 +470,8 @@ void clocksource_resume(void) * clocksource_touch_watchdog - Update watchdog * * Update the watchdog after exception contexts such as kgdb so as not - * to incorrectly trip the watchdog. - * + * to incorrectly trip the watchdog. This might fail when the kernel + * was stopped in code which holds watchdog_lock. */ void clocksource_touch_watchdog(void) { -- cgit v1.2.3 From 492a74f4210e15f4701422e2e1c4cd3c1e45ddae Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 25 Jan 2010 15:17:47 -0500 Subject: ring-buffer: Check if ring buffer iterator has stale data Usually reads of the ring buffer is performed by a single task. There are two types of reads from the ring buffer. One is a consuming read which will consume the entry that was read and the next read will be the entry that follows. The other is an iterator that will let the user read the contents of the ring buffer without modifying it. When an iterator is allocated, writes to the ring buffer are disabled to protect the iterator. The problem exists when consuming reads happen while an iterator is allocated. Specifically, the kind of read that swaps out an entire page (used by splice) and replaces it with a new read. If the iterator is on the page that is swapped out, then the next read may read from this swapped out page and return garbage. This patch adds a check when reading the iterator to make sure that the iterator contents are still valid. If a consuming read has taken place, the iterator is reset. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index edefe3b2801b..503b630e0bda 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -464,6 +464,8 @@ struct ring_buffer_iter { struct ring_buffer_per_cpu *cpu_buffer; unsigned long head; struct buffer_page *head_page; + struct buffer_page *cache_reader_page; + unsigned long cache_read; u64 read_stamp; }; @@ -2716,6 +2718,8 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) iter->read_stamp = cpu_buffer->read_stamp; else iter->read_stamp = iter->head_page->page->time_stamp; + iter->cache_reader_page = cpu_buffer->reader_page; + iter->cache_read = cpu_buffer->read; } /** @@ -3066,6 +3070,15 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) cpu_buffer = iter->cpu_buffer; buffer = cpu_buffer->buffer; + /* + * Check if someone performed a consuming read to + * the buffer. A consuming read invalidates the iterator + * and we need to reset the iterator in this case. + */ + if (unlikely(iter->cache_read != cpu_buffer->read || + iter->cache_reader_page != cpu_buffer->reader_page)) + rb_iter_reset(iter); + again: /* * We repeat when a timestamp is encountered. -- cgit v1.2.3 From 3c05d7482777f15e71bb4cb1ba78dee2800dfec6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 26 Jan 2010 16:14:08 -0500 Subject: ring-buffer: Check for end of page in iterator If the iterator comes to an empty page for some reason, or if the page is emptied by a consuming read. The iterator code currently does not check if the iterator is pass the contents, and may return a false entry. This patch adds a check to the ring buffer iterator to test if the current page has been completely read and sets the iterator to the next page if necessary. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 503b630e0bda..8c1b2d290718 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3064,9 +3064,6 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_event *event; int nr_loops = 0; - if (ring_buffer_iter_empty(iter)) - return NULL; - cpu_buffer = iter->cpu_buffer; buffer = cpu_buffer->buffer; @@ -3080,6 +3077,9 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) rb_iter_reset(iter); again: + if (ring_buffer_iter_empty(iter)) + return NULL; + /* * We repeat when a timestamp is encountered. * We can get multiple timestamps by nested interrupts or also @@ -3094,6 +3094,11 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) if (rb_per_cpu_empty(cpu_buffer)) return NULL; + if (iter->head >= local_read(&iter->head_page->page->commit)) { + rb_inc_iter(iter); + goto again; + } + event = rb_iter_head_event(iter); switch (event->type_len) { -- cgit v1.2.3 From 03688970347bfea32823953a7ce5886d1713205f Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Fri, 22 Jan 2010 08:12:47 -0500 Subject: tracing/documentation: Cover new frame pointer semantics Update the graph tracer examples to cover the new frame pointer semantics (in terms of passing it along). Move the HAVE_FUNCTION_GRAPH_FP_TEST docs out of the Kconfig, into the right place, and expand on the details. Signed-off-by: Mike Frysinger LKML-Reference: <1264165967-18938-1-git-send-email-vapier@gentoo.org> Signed-off-by: Steven Rostedt --- Documentation/trace/ftrace-design.txt | 26 +++++++++++++++++++++++--- kernel/trace/Kconfig | 4 +--- 2 files changed, 24 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt index 239f14b2b55a..6a5a579126b0 100644 --- a/Documentation/trace/ftrace-design.txt +++ b/Documentation/trace/ftrace-design.txt @@ -1,5 +1,6 @@ function tracer guts ==================== + By Mike Frysinger Introduction ------------ @@ -173,14 +174,16 @@ void ftrace_graph_caller(void) unsigned long *frompc = &...; unsigned long selfpc = - MCOUNT_INSN_SIZE; - prepare_ftrace_return(frompc, selfpc); + /* passing frame pointer up is optional -- see below */ + prepare_ftrace_return(frompc, selfpc, frame_pointer); /* restore all state needed by the ABI */ } #endif -For information on how to implement prepare_ftrace_return(), simply look at -the x86 version. The only architecture-specific piece in it is the setup of +For information on how to implement prepare_ftrace_return(), simply look at the +x86 version (the frame pointer passing is optional; see the next section for +more information). The only architecture-specific piece in it is the setup of the fault recovery table (the asm(...) code). The rest should be the same across architectures. @@ -205,6 +208,23 @@ void return_to_handler(void) #endif +HAVE_FUNCTION_GRAPH_FP_TEST +--------------------------- + +An arch may pass in a unique value (frame pointer) to both the entering and +exiting of a function. On exit, the value is compared and if it does not +match, then it will panic the kernel. This is largely a sanity check for bad +code generation with gcc. If gcc for your port sanely updates the frame +pointer under different opitmization levels, then ignore this option. + +However, adding support for it isn't terribly difficult. In your assembly code +that calls prepare_ftrace_return(), pass the frame pointer as the 3rd argument. +Then in the C version of that function, do what the x86 port does and pass it +along to ftrace_push_return_trace() instead of a stub value of 0. + +Similarly, when you call ftrace_return_to_handler(), pass it the frame pointer. + + HAVE_FTRACE_NMI_ENTER --------------------- diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 6c22d8a2f289..60e2ce0181ee 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -27,9 +27,7 @@ config HAVE_FUNCTION_GRAPH_TRACER config HAVE_FUNCTION_GRAPH_FP_TEST bool help - An arch may pass in a unique value (frame pointer) to both the - entering and exiting of a function. On exit, the value is compared - and if it does not match, then it will panic the kernel. + See Documentation/trace/ftrace-design.txt config HAVE_FUNCTION_TRACE_MCOUNT_TEST bool -- cgit v1.2.3 From 48d50674179981e41f432167b2441cec782d5484 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 26 Jan 2010 19:16:41 +0100 Subject: lockdep: Fix check_usage_backwards() error message Lockdep has found the real bug, but the output doesn't look right to me: > ========================================================= > [ INFO: possible irq lock inversion dependency detected ] > 2.6.33-rc5 #77 > --------------------------------------------------------- > emacs/1609 just changed the state of lock: > (&(&tty->ctrl_lock)->rlock){+.....}, at: [] tty_fasync+0xe8/0x190 > but this lock took another, HARDIRQ-unsafe lock in the past: > (&(&sighand->siglock)->rlock){-.....} "HARDIRQ-unsafe" and "this lock took another" looks wrong, afaics. > ... key at: [] __key.46539+0x0/0x8 > ... acquired at: > [] __lock_acquire+0x1056/0x15a0 > [] lock_acquire+0x9f/0x120 > [] _raw_spin_lock_irqsave+0x52/0x90 > [] __proc_set_tty+0x3e/0x150 > [] tty_open+0x51d/0x5e0 The stack-trace shows that this lock (ctrl_lock) was taken under ->siglock (which is hopefully irq-safe). This is a clear typo in check_usage_backwards() where we tell the print a fancy routine we're forwards. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra LKML-Reference: <20100126181641.GA10460@redhat.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 5feaddcdbe49..c62ec14609b9 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2147,7 +2147,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, return ret; return print_irq_inversion_bug(curr, &root, target_entry, - this, 1, irqclass); + this, 0, irqclass); } void print_irqtrace_events(struct task_struct *curr) -- cgit v1.2.3 From 11854247e2c851e7ff9ce138e501c6cffc5a4217 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 21 Jan 2010 16:34:27 +0100 Subject: sched: Fix incorrect sanity check We moved to migrate on wakeup, which means that sleeping tasks could still be present on offline cpus. Amend the check to only test running tasks. Reported-by: Heiko Carstens Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 1c8ddd6ee940..08e54e7beaed 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -151,7 +151,7 @@ static inline void check_for_tasks(int cpu) write_lock_irq(&tasklist_lock); for_each_process(p) { - if (task_cpu(p) == cpu && + if (task_cpu(p) == cpu && p->state == TASK_RUNNING && (!cputime_eq(p->utime, cputime_zero) || !cputime_eq(p->stime, cputime_zero))) printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ -- cgit v1.2.3 From 9d3cfc4c1d17c6d3bc1373e3b954c56b92607755 Mon Sep 17 00:00:00 2001 From: Frans Pop Date: Mon, 25 Jan 2010 14:56:34 +0100 Subject: sched: Correct printk whitespace in warning from cpu down task check Due to an incorrect line break the output currently contains tabs. Also remove trailing space. The actual output that logcheck sent me looked like this: Task events/1 (pid = 10) is on cpu 1^I^I^I^I(state = 1, flags = 84208040) After this patch it becomes: Task events/1 (pid = 10) is on cpu 1 (state = 1, flags = 84208040) Signed-off-by: Frans Pop Signed-off-by: Peter Zijlstra LKML-Reference: <201001251456.34996.elendil@planet.nl> Signed-off-by: Ingo Molnar --- kernel/cpu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 08e54e7beaed..677f25376a38 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -154,10 +154,10 @@ static inline void check_for_tasks(int cpu) if (task_cpu(p) == cpu && p->state == TASK_RUNNING && (!cputime_eq(p->utime, cputime_zero) || !cputime_eq(p->stime, cputime_zero))) - printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ - (state = %ld, flags = %x) \n", - p->comm, task_pid_nr(p), cpu, - p->state, p->flags); + printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " + "(state = %ld, flags = %x)\n", + p->comm, task_pid_nr(p), cpu, + p->state, p->flags); } write_unlock_irq(&tasklist_lock); } -- cgit v1.2.3 From b23ff0e9330e4b11e18af984d50573598e10e7f9 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Thu, 21 Jan 2010 18:25:16 +0530 Subject: hw_breakpoints: Release the bp slot if arch_validate_hwbkpt_settings() fails. On a given architecture, when hardware breakpoint registration fails due to un-supported access type (read/write/execute), we lose the bp slot since register_perf_hw_breakpoint() does not release the bp slot on failure. Hence, any subsequent hardware breakpoint registration starts failing with 'no space left on device' error. This patch introduces error handling in register_perf_hw_breakpoint() function and releases bp slot on error. Signed-off-by: Mahesh Salgaonkar Cc: Ananth N Mavinakayanahalli Cc: K. Prasad Cc: Maneesh Soni LKML-Reference: <20100121125516.GA32521@in.ibm.com> Signed-off-by: Frederic Weisbecker --- kernel/hw_breakpoint.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 50dbd5999588..c030ae657f20 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -296,6 +296,10 @@ int register_perf_hw_breakpoint(struct perf_event *bp) if (!bp->attr.disabled || !bp->overflow_handler) ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + /* if arch_validate_hwbkpt_settings() fails then release bp slot */ + if (ret) + release_bp_slot(bp); + return ret; } -- cgit v1.2.3 From cc0967490c1c3824bc5b75718b6ca8a51d9f2617 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 28 Jan 2010 17:04:42 -0600 Subject: x86, hw_breakpoints, kgdb: Fix kgdb to use hw_breakpoint API In the 2.6.33 kernel, the hw_breakpoint API is now used for the performance event counters. The hw_breakpoint_handler() now consumes the hw breakpoints that were previously set by kgdb arch specific code. In order for kgdb to work in conjunction with this core API change, kgdb must use some of the low level functions of the hw_breakpoint API to install, uninstall, and deal with hw breakpoint reservations. The kgdb core required a change to call kgdb_disable_hw_debug anytime a slave cpu enters kgdb_wait() in order to keep all the hw breakpoints in sync as well as to prevent hitting a hw breakpoint while kgdb is active. During the architecture specific initialization of kgdb, it will pre-allocate 4 disabled (struct perf event **) structures. Kgdb will use these to manage the capabilities for the 4 hw breakpoint registers, per cpu. Right now the hw_breakpoint API does not have a way to ask how many breakpoints are available, on each CPU so it is possible that the install of a breakpoint might fail when kgdb restores the system to the run state. The intent of this patch is to first get the basic functionality of hw breakpoints working and leave it to the person debugging the kernel to understand what hw breakpoints are in use and what restrictions have been imposed as a result. Breakpoint constraints will be dealt with in a future patch. While atomic, the x86 specific kgdb code will call arch_uninstall_hw_breakpoint() and arch_install_hw_breakpoint() to manage the cpu specific hw breakpoints. The net result of these changes allow kgdb to use the same pool of hw_breakpoints that are used by the perf event API, but neither knows about future reservations for the available hw breakpoint slots. Signed-off-by: Jason Wessel Acked-by: Frederic Weisbecker Cc: kgdb-bugreport@lists.sourceforge.net Cc: K.Prasad Cc: Peter Zijlstra Cc: Alan Stern Cc: torvalds@linux-foundation.org LKML-Reference: <1264719883-7285-2-git-send-email-jason.wessel@windriver.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kgdb.c | 171 ++++++++++++++++++++++++++++++++----------------- kernel/kgdb.c | 3 + 2 files changed, 117 insertions(+), 57 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index dd74fe7273b1..62bea7307eaa 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -204,40 +205,38 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) static struct hw_breakpoint { unsigned enabled; - unsigned type; - unsigned len; unsigned long addr; + int len; + int type; + struct perf_event **pev; } breakinfo[4]; static void kgdb_correct_hw_break(void) { - unsigned long dr7; - int correctit = 0; - int breakbit; int breakno; - get_debugreg(dr7, 7); for (breakno = 0; breakno < 4; breakno++) { - breakbit = 2 << (breakno << 1); - if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { - correctit = 1; - dr7 |= breakbit; - dr7 &= ~(0xf0000 << (breakno << 2)); - dr7 |= ((breakinfo[breakno].len << 2) | - breakinfo[breakno].type) << - ((breakno << 2) + 16); - set_debugreg(breakinfo[breakno].addr, breakno); - - } else { - if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { - correctit = 1; - dr7 &= ~breakbit; - dr7 &= ~(0xf0000 << (breakno << 2)); - } - } + struct perf_event *bp; + struct arch_hw_breakpoint *info; + int val; + int cpu = raw_smp_processor_id(); + if (!breakinfo[breakno].enabled) + continue; + bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu); + info = counter_arch_bp(bp); + if (bp->attr.disabled != 1) + continue; + bp->attr.bp_addr = breakinfo[breakno].addr; + bp->attr.bp_len = breakinfo[breakno].len; + bp->attr.bp_type = breakinfo[breakno].type; + info->address = breakinfo[breakno].addr; + info->len = breakinfo[breakno].len; + info->type = breakinfo[breakno].type; + val = arch_install_hw_breakpoint(bp); + if (!val) + bp->attr.disabled = 0; } - if (correctit) - set_debugreg(dr7, 7); + hw_breakpoint_restore(); } static int @@ -259,15 +258,23 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) static void kgdb_remove_all_hw_break(void) { int i; + int cpu = raw_smp_processor_id(); + struct perf_event *bp; - for (i = 0; i < 4; i++) - memset(&breakinfo[i], 0, sizeof(struct hw_breakpoint)); + for (i = 0; i < 4; i++) { + if (!breakinfo[i].enabled) + continue; + bp = *per_cpu_ptr(breakinfo[i].pev, cpu); + if (bp->attr.disabled == 1) + continue; + arch_uninstall_hw_breakpoint(bp); + bp->attr.disabled = 1; + } } static int kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) { - unsigned type; int i; for (i = 0; i < 4; i++) @@ -278,27 +285,38 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) switch (bptype) { case BP_HARDWARE_BREAKPOINT: - type = 0; - len = 1; + len = 1; + breakinfo[i].type = X86_BREAKPOINT_EXECUTE; break; case BP_WRITE_WATCHPOINT: - type = 1; + breakinfo[i].type = X86_BREAKPOINT_WRITE; break; case BP_ACCESS_WATCHPOINT: - type = 3; + breakinfo[i].type = X86_BREAKPOINT_RW; break; default: return -1; } - - if (len == 1 || len == 2 || len == 4) - breakinfo[i].len = len - 1; - else + switch (len) { + case 1: + breakinfo[i].len = X86_BREAKPOINT_LEN_1; + break; + case 2: + breakinfo[i].len = X86_BREAKPOINT_LEN_2; + break; + case 4: + breakinfo[i].len = X86_BREAKPOINT_LEN_4; + break; +#ifdef CONFIG_X86_64 + case 8: + breakinfo[i].len = X86_BREAKPOINT_LEN_8; + break; +#endif + default: return -1; - - breakinfo[i].enabled = 1; + } breakinfo[i].addr = addr; - breakinfo[i].type = type; + breakinfo[i].enabled = 1; return 0; } @@ -313,8 +331,21 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) */ void kgdb_disable_hw_debug(struct pt_regs *regs) { + int i; + int cpu = raw_smp_processor_id(); + struct perf_event *bp; + /* Disable hardware debugging while we are in kgdb: */ set_debugreg(0UL, 7); + for (i = 0; i < 4; i++) { + if (!breakinfo[i].enabled) + continue; + bp = *per_cpu_ptr(breakinfo[i].pev, cpu); + if (bp->attr.disabled == 1) + continue; + arch_uninstall_hw_breakpoint(bp); + bp->attr.disabled = 1; + } } /** @@ -378,7 +409,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, struct pt_regs *linux_regs) { unsigned long addr; - unsigned long dr6; char *ptr; int newPC; @@ -404,20 +434,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, raw_smp_processor_id()); } - get_debugreg(dr6, 6); - if (!(dr6 & 0x4000)) { - int breakno; - - for (breakno = 0; breakno < 4; breakno++) { - if (dr6 & (1 << breakno) && - breakinfo[breakno].type == 0) { - /* Set restore flag: */ - linux_regs->flags |= X86_EFLAGS_RF; - break; - } - } - } - set_debugreg(0UL, 6); kgdb_correct_hw_break(); return 0; @@ -485,8 +501,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) break; case DIE_DEBUG: - if (atomic_read(&kgdb_cpu_doing_single_step) == - raw_smp_processor_id()) { + if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { if (user_mode(regs)) return single_step_cont(regs, args); break; @@ -539,7 +554,42 @@ static struct notifier_block kgdb_notifier = { */ int kgdb_arch_init(void) { - return register_die_notifier(&kgdb_notifier); + int i, cpu; + int ret; + struct perf_event_attr attr; + struct perf_event **pevent; + + ret = register_die_notifier(&kgdb_notifier); + if (ret != 0) + return ret; + /* + * Pre-allocate the hw breakpoint structions in the non-atomic + * portion of kgdb because this operation requires mutexs to + * complete. + */ + attr.bp_addr = (unsigned long)kgdb_arch_init; + attr.type = PERF_TYPE_BREAKPOINT; + attr.bp_len = HW_BREAKPOINT_LEN_1; + attr.bp_type = HW_BREAKPOINT_W; + attr.disabled = 1; + for (i = 0; i < 4; i++) { + breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); + if (IS_ERR(breakinfo[i].pev)) { + printk(KERN_ERR "kgdb: Could not allocate hw breakpoints\n"); + breakinfo[i].pev = NULL; + kgdb_arch_exit(); + return -1; + } + for_each_online_cpu(cpu) { + pevent = per_cpu_ptr(breakinfo[i].pev, cpu); + pevent[0]->hw.sample_period = 1; + if (pevent[0]->destroy != NULL) { + pevent[0]->destroy = NULL; + release_bp_slot(*pevent); + } + } + } + return ret; } /** @@ -550,6 +600,13 @@ int kgdb_arch_init(void) */ void kgdb_arch_exit(void) { + int i; + for (i = 0; i < 4; i++) { + if (breakinfo[i].pev) { + unregister_wide_hw_breakpoint(breakinfo[i].pev); + breakinfo[i].pev = NULL; + } + } unregister_die_notifier(&kgdb_notifier); } diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 2eb517e23514..c7ade62e4ef0 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -583,6 +583,9 @@ static void kgdb_wait(struct pt_regs *regs) smp_wmb(); atomic_set(&cpu_in_kgdb[cpu], 1); + /* Disable any cpu specific hw breakpoints */ + kgdb_disable_hw_debug(regs); + /* Wait till primary CPU is done with debugging */ while (atomic_read(&passive_cpu_wait[cpu])) cpu_relax(); -- cgit v1.2.3 From 5352ae638e2d7d5c9b2e4d528676bbf2af6fd6f3 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 28 Jan 2010 17:04:43 -0600 Subject: perf, hw_breakpoint, kgdb: Do not take mutex for kernel debugger This patch fixes the regression in functionality where the kernel debugger and the perf API do not nicely share hw breakpoint reservations. The kernel debugger cannot use any mutex_lock() calls because it can start the kernel running from an invalid context. A mutex free version of the reservation API needed to get created for the kernel debugger to safely update hw breakpoint reservations. The possibility for a breakpoint reservation to be concurrently processed at the time that kgdb interrupts the system is improbable. Should this corner case occur the end user is warned, and the kernel debugger will prohibit updating the hardware breakpoint reservations. Any time the kernel debugger reserves a hardware breakpoint it will be a system wide reservation. Signed-off-by: Jason Wessel Acked-by: Frederic Weisbecker Cc: kgdb-bugreport@lists.sourceforge.net Cc: K.Prasad Cc: Peter Zijlstra Cc: Alan Stern Cc: torvalds@linux-foundation.org LKML-Reference: <1264719883-7285-3-git-send-email-jason.wessel@windriver.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kgdb.c | 51 ++++++++++++++++++++++++++++++++++++++++++ include/linux/hw_breakpoint.h | 2 ++ kernel/hw_breakpoint.c | 52 ++++++++++++++++++++++++++++++++++--------- 3 files changed, 95 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 62bea7307eaa..bfba6019d762 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -239,6 +239,49 @@ static void kgdb_correct_hw_break(void) hw_breakpoint_restore(); } +static int hw_break_reserve_slot(int breakno) +{ + int cpu; + int cnt = 0; + struct perf_event **pevent; + + for_each_online_cpu(cpu) { + cnt++; + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + if (dbg_reserve_bp_slot(*pevent)) + goto fail; + } + + return 0; + +fail: + for_each_online_cpu(cpu) { + cnt--; + if (!cnt) + break; + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + dbg_release_bp_slot(*pevent); + } + return -1; +} + +static int hw_break_release_slot(int breakno) +{ + struct perf_event **pevent; + int cpu; + + for_each_online_cpu(cpu) { + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + if (dbg_release_bp_slot(*pevent)) + /* + * The debugger is responisble for handing the retry on + * remove failure. + */ + return -1; + } + return 0; +} + static int kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) { @@ -250,6 +293,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) if (i == 4) return -1; + if (hw_break_release_slot(i)) { + printk(KERN_ERR "Cannot remove hw breakpoint at %lx\n", addr); + return -1; + } breakinfo[i].enabled = 0; return 0; @@ -316,6 +363,10 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) return -1; } breakinfo[i].addr = addr; + if (hw_break_reserve_slot(i)) { + breakinfo[i].addr = 0; + return -1; + } breakinfo[i].enabled = 1; return 0; diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index 41235c93e4e9..070ba0621738 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -75,6 +75,8 @@ extern int __register_perf_hw_breakpoint(struct perf_event *bp); extern void unregister_hw_breakpoint(struct perf_event *bp); extern void unregister_wide_hw_breakpoint(struct perf_event **cpu_events); +extern int dbg_reserve_bp_slot(struct perf_event *bp); +extern int dbg_release_bp_slot(struct perf_event *bp); extern int reserve_bp_slot(struct perf_event *bp); extern void release_bp_slot(struct perf_event *bp); diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index c030ae657f20..8a5c7d55ac9f 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -243,38 +243,70 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM */ -int reserve_bp_slot(struct perf_event *bp) +static int __reserve_bp_slot(struct perf_event *bp) { struct bp_busy_slots slots = {0}; - int ret = 0; - - mutex_lock(&nr_bp_mutex); fetch_bp_busy_slots(&slots, bp); /* Flexible counters need to keep at least one slot */ - if (slots.pinned + (!!slots.flexible) == HBP_NUM) { - ret = -ENOSPC; - goto end; - } + if (slots.pinned + (!!slots.flexible) == HBP_NUM) + return -ENOSPC; toggle_bp_slot(bp, true); -end: + return 0; +} + +int reserve_bp_slot(struct perf_event *bp) +{ + int ret; + + mutex_lock(&nr_bp_mutex); + + ret = __reserve_bp_slot(bp); + mutex_unlock(&nr_bp_mutex); return ret; } +static void __release_bp_slot(struct perf_event *bp) +{ + toggle_bp_slot(bp, false); +} + void release_bp_slot(struct perf_event *bp) { mutex_lock(&nr_bp_mutex); - toggle_bp_slot(bp, false); + __release_bp_slot(bp); mutex_unlock(&nr_bp_mutex); } +/* + * Allow the kernel debugger to reserve breakpoint slots without + * taking a lock using the dbg_* variant of for the reserve and + * release breakpoint slots. + */ +int dbg_reserve_bp_slot(struct perf_event *bp) +{ + if (mutex_is_locked(&nr_bp_mutex)) + return -1; + + return __reserve_bp_slot(bp); +} + +int dbg_release_bp_slot(struct perf_event *bp) +{ + if (mutex_is_locked(&nr_bp_mutex)) + return -1; + + __release_bp_slot(bp); + + return 0; +} int register_perf_hw_breakpoint(struct perf_event *bp) { -- cgit v1.2.3 From d6ad3e286d2c075a60b9f11075a2c55aeeeca2ad Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Wed, 27 Jan 2010 16:25:22 -0600 Subject: softlockup: Add sched_clock_tick() to avoid kernel warning on kgdb resume When CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is set, sched_clock() gets the time from hardware such as the TSC on x86. In this configuration kgdb will report a softlock warning message on resuming or detaching from a debug session. Sequence of events in the problem case: 1) "cpu sched clock" and "hardware time" are at 100 sec prior to a call to kgdb_handle_exception() 2) Debugger waits in kgdb_handle_exception() for 80 sec and on exit the following is called ... touch_softlockup_watchdog() --> __raw_get_cpu_var(touch_timestamp) = 0; 3) "cpu sched clock" = 100s (it was not updated, because the interrupt was disabled in kgdb) but the "hardware time" = 180 sec 4) The first timer interrupt after resuming from kgdb_handle_exception updates the watchdog from the "cpu sched clock" update_process_times() { ... run_local_timers() --> softlockup_tick() --> check (touch_timestamp == 0) (it is "YES" here, we have set "touch_timestamp = 0" at kgdb) --> __touch_softlockup_watchdog() ***(A)--> reset "touch_timestamp" to "get_timestamp()" (Here, the "touch_timestamp" will still be set to 100s.) ... scheduler_tick() ***(B)--> sched_clock_tick() (update "cpu sched clock" to "hardware time" = 180s) ... } 5) The Second timer interrupt handler appears to have a large jump and trips the softlockup warning. update_process_times() { ... run_local_timers() --> softlockup_tick() --> "cpu sched clock" - "touch_timestamp" = 180s-100s > 60s --> printk "soft lockup error messages" ... } note: ***(A) reset "touch_timestamp" to "get_timestamp(this_cpu)" Why is "touch_timestamp" 100 sec, instead of 180 sec? When CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is set, the call trace of get_timestamp() is: get_timestamp(this_cpu) -->cpu_clock(this_cpu) -->sched_clock_cpu(this_cpu) -->__update_sched_clock(sched_clock_data, now) The __update_sched_clock() function uses the GTOD tick value to create a window to normalize the "now" values. So if "now" value is too big for sched_clock_data, it will be ignored. The fix is to invoke sched_clock_tick() to update "cpu sched clock" in order to recover from this state. This is done by introducing the function touch_softlockup_watchdog_sync(). This allows kgdb to request that the sched clock is updated when the watchdog thread runs the first time after a resume from kgdb. [yong.zhang0@gmail.com: Use per cpu instead of an array] Signed-off-by: Jason Wessel Signed-off-by: Dongdong Deng Cc: kgdb-bugreport@lists.sourceforge.net Cc: peterz@infradead.org LKML-Reference: <1264631124-4837-2-git-send-email-jason.wessel@windriver.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++++ kernel/kgdb.c | 6 +++--- kernel/softlockup.c | 15 +++++++++++++++ 3 files changed, 22 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6f7bba93929b..89232151a9d0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -310,6 +310,7 @@ extern void sched_show_task(struct task_struct *p); #ifdef CONFIG_DETECT_SOFTLOCKUP extern void softlockup_tick(void); extern void touch_softlockup_watchdog(void); +extern void touch_softlockup_watchdog_sync(void); extern void touch_all_softlockup_watchdogs(void); extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write, void __user *buffer, @@ -323,6 +324,9 @@ static inline void softlockup_tick(void) static inline void touch_softlockup_watchdog(void) { } +static inline void touch_softlockup_watchdog_sync(void) +{ +} static inline void touch_all_softlockup_watchdogs(void) { } diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 2eb517e23514..87f2cc557553 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -596,7 +596,7 @@ static void kgdb_wait(struct pt_regs *regs) /* Signal the primary CPU that we are done: */ atomic_set(&cpu_in_kgdb[cpu], 0); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); local_irq_restore(flags); } @@ -1450,7 +1450,7 @@ acquirelock: (kgdb_info[cpu].task && kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); local_irq_restore(flags); @@ -1550,7 +1550,7 @@ kgdb_restore: } /* Free kgdb_active */ atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); local_irq_restore(flags); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index d22579087e27..0d4c7898ab80 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -25,6 +25,7 @@ static DEFINE_SPINLOCK(print_lock); static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); +static DEFINE_PER_CPU(bool, softlock_touch_sync); static int __read_mostly did_panic; int __read_mostly softlockup_thresh = 60; @@ -79,6 +80,12 @@ void touch_softlockup_watchdog(void) } EXPORT_SYMBOL(touch_softlockup_watchdog); +void touch_softlockup_watchdog_sync(void) +{ + __raw_get_cpu_var(softlock_touch_sync) = true; + __raw_get_cpu_var(softlockup_touch_ts) = 0; +} + void touch_all_softlockup_watchdogs(void) { int cpu; @@ -118,6 +125,14 @@ void softlockup_tick(void) } if (touch_ts == 0) { + if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) { + /* + * If the time stamp was touched atomically + * make sure the scheduler tick is up to date. + */ + per_cpu(softlock_touch_sync, this_cpu) = false; + sched_clock_tick(); + } __touch_softlockup_watchdog(); return; } -- cgit v1.2.3 From 4f48f8b7fd18c44f8478174f9925cc3c059c6ce4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 2 Feb 2010 15:32:09 +0800 Subject: tracing: Fix circular dead lock in stack trace When we cat /tracing/stack_trace, we may cause circular lock: sys_read() t_start() arch_spin_lock(&max_stack_lock); t_show() seq_printf(), vsnprintf() .... /* they are all trace-able, when they are traced, max_stack_lock may be required again. */ The following script can trigger this circular dead lock very easy: #!/bin/bash echo 1 > /proc/sys/kernel/stack_tracer_enabled mount -t debugfs xxx /mnt > /dev/null 2>&1 ( # make check_stack() zealous to require max_stack_lock for ((; ;)) { echo 1 > /mnt/tracing/stack_max_size } ) & for ((; ;)) { cat /mnt/tracing/stack_trace > /dev/null } To fix this bug, we increase the percpu trace_active before require the lock. Reported-by: Li Zefan Signed-off-by: Lai Jiangshan LKML-Reference: <4B67D4F9.9080905@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 678a5120ee30..f4bc9b27de5f 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -157,6 +157,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, unsigned long val, flags; char buf[64]; int ret; + int cpu; if (count >= sizeof(buf)) return -EINVAL; @@ -171,9 +172,20 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, return ret; local_irq_save(flags); + + /* + * In case we trace inside arch_spin_lock() or after (NMI), + * we will cause circular lock, so we also need to increase + * the percpu trace_active here. + */ + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)++; + arch_spin_lock(&max_stack_lock); *ptr = val; arch_spin_unlock(&max_stack_lock); + + per_cpu(trace_active, cpu)--; local_irq_restore(flags); return count; @@ -206,7 +218,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { + int cpu; + local_irq_disable(); + + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)++; + arch_spin_lock(&max_stack_lock); if (*pos == 0) @@ -217,7 +235,13 @@ static void *t_start(struct seq_file *m, loff_t *pos) static void t_stop(struct seq_file *m, void *p) { + int cpu; + arch_spin_unlock(&max_stack_lock); + + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)--; + local_irq_enable(); } -- cgit v1.2.3 From b8a1d37c5f981cdd2e83c9fd98198832324cd57a Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 3 Feb 2010 09:31:36 +1100 Subject: kernel/cred.c: use kmem_cache_free Free memory allocated using kmem_cache_zalloc using kmem_cache_free rather than kfree. The semantic patch that makes this change is as follows: (http://coccinelle.lip6.fr/) // @@ expression x,E,c; @@ x = \(kmem_cache_alloc\|kmem_cache_zalloc\|kmem_cache_alloc_node\)(c,...) ... when != x = E when != &x ?-kfree(x) +kmem_cache_free(c,x) // Signed-off-by: Julia Lawall Acked-by: David Howells Cc: James Morris Cc: Steve Dickson Cc: Signed-off-by: Andrew Morton Signed-off-by: James Morris --- kernel/cred.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index dd76cfe5f5b0..1ed8ca18790c 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -224,7 +224,7 @@ struct cred *cred_alloc_blank(void) #ifdef CONFIG_KEYS new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); if (!new->tgcred) { - kfree(new); + kmem_cache_free(cred_jar, new); return NULL; } atomic_set(&new->tgcred->usage, 1); -- cgit v1.2.3 From bc173f7092c76a7967f135c2b3a54052ad99733b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 2 Feb 2010 13:44:01 -0800 Subject: kfifo: fix kernel-doc notation Fix kfifo kernel-doc warnings: Warning(kernel/kfifo.c:361): No description found for parameter 'total' Warning(kernel/kfifo.c:402): bad line: @ @lenout: pointer to output variable with copied data Warning(kernel/kfifo.c:412): No description found for parameter 'lenout' Signed-off-by: Randy Dunlap Cc: Stefani Seibold Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kfifo.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 32c5c15d750d..498cabba225e 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -349,6 +349,7 @@ EXPORT_SYMBOL(__kfifo_from_user_n); * @fifo: the fifo to be used. * @from: pointer to the data to be added. * @len: the length of the data to be added. + * @total: the actual returned data length. * * This function copies at most @len bytes from the @from into the * FIFO depending and returns -EFAULT/0. @@ -399,7 +400,7 @@ EXPORT_SYMBOL(__kfifo_to_user_n); * @fifo: the fifo to be used. * @to: where the data must be copied. * @len: the size of the destination buffer. - @ @lenout: pointer to output variable with copied data + * @lenout: pointer to output variable with copied data * * This function copies at most @len bytes from the FIFO into the * @to buffer and 0 or -EFAULT. -- cgit v1.2.3 From 4528fd0595847c2078b59f24800e751c2d6b7e41 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 2 Feb 2010 13:44:10 -0800 Subject: cgroups: fix to return errno in a failure path In cgroup_create(), if alloc_css_id() returns failure, the errno is not propagated to userspace, so mkdir will fail silently. To trigger this bug, we mount blkio (or memory subsystem), and create more then 65534 cgroups. (The number of cgroups is limited to 65535 if a subsystem has use_id == 1) # mount -t cgroup -o blkio xxx /mnt # for ((i = 0; i < 65534; i++)); do mkdir /mnt/$i; done # mkdir /mnt/65534 (should return ENOSPC) # Signed-off-by: Li Zefan Acked-by: Serge Hallyn Acked-by: Paul Menage Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1fbcc748044a..aa3bee566446 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2936,14 +2936,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_subsys(root, ss) { struct cgroup_subsys_state *css = ss->create(ss, cgrp); + if (IS_ERR(css)) { err = PTR_ERR(css); goto err_destroy; } init_cgroup_css(css, ss, cgrp); - if (ss->use_id) - if (alloc_css_id(ss, parent, cgrp)) + if (ss->use_id) { + err = alloc_css_id(ss, parent, cgrp); + if (err) goto err_destroy; + } /* At error, ->destroy() callback has to free assigned ID. */ } -- cgit v1.2.3 From 5ecb01cfdf96c5f465192bdb2a4fd4a61a24c6cc Mon Sep 17 00:00:00 2001 From: Mikael Pettersson Date: Sat, 23 Jan 2010 22:36:29 +0100 Subject: futex_lock_pi() key refcnt fix This fixes a futex key reference count bug in futex_lock_pi(), where a key's reference count is incremented twice but decremented only once, causing the backing object to not be released. If the futex is created in a temporary file in an ext3 file system, this bug causes the file's inode to become an "undead" orphan, which causes an oops from a BUG_ON() in ext3_put_super() when the file system is unmounted. glibc's test suite is known to trigger this, see . The bug is a regression from 2.6.28-git3, namely Peter Zijlstra's 38d47c1b7075bd7ec3881141bb3629da58f88dab "[PATCH] futex: rely on get_user_pages() for shared futexes". That commit made get_futex_key() also increment the reference count of the futex key, and updated its callers to decrement the key's reference count before returning. Unfortunately the normal exit path in futex_lock_pi() wasn't corrected: the reference count is incremented by get_futex_key() and queue_lock(), but the normal exit path only decrements once, via unqueue_me_pi(). The fix is to put_futex_key() after unqueue_me_pi(), since 2.6.31 this is easily done by 'goto out_put_key' rather than 'goto out'. Signed-off-by: Mikael Pettersson Acked-by: Peter Zijlstra Acked-by: Darren Hart Signed-off-by: Thomas Gleixner Cc: --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index d9b3a2228f9d..17828033a639 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1971,7 +1971,7 @@ retry_private: /* Unqueue and drop the lock */ unqueue_me_pi(&q); - goto out; + goto out_put_key; out_unlock_put_key: queue_unlock(&q, hb); -- cgit v1.2.3 From 51246bfd189064079c54421507236fd2723b18f3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 2 Feb 2010 11:40:27 +0100 Subject: futex: Handle user space corruption gracefully If the owner of a PI futex dies we fix up the pi_state and set pi_state->owner to NULL. When a malicious or just sloppy programmed user space application sets the futex value to 0 e.g. by calling pthread_mutex_init(), then the futex can be acquired again. A new waiter manages to enqueue itself on the pi_state w/o damage, but on unlock the kernel dereferences pi_state->owner and oopses. Prevent this by checking pi_state->owner in the unlock path. If pi_state->owner is not current we know that user space manipulated the futex value. Ignore the mess and return -EINVAL. This catches the above case and also the case where a task hijacks the futex by setting the tid value and then tries to unlock it. Reported-by: Jermome Marchand Signed-off-by: Thomas Gleixner Acked-by: Darren Hart Acked-by: Peter Zijlstra Cc: --- kernel/futex.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 17828033a639..06e8240d2abe 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -758,6 +758,13 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) if (!pi_state) return -EINVAL; + /* + * If current does not own the pi_state then the futex is + * inconsistent and user space fiddled with the futex value. + */ + if (pi_state->owner != current) + return -EINVAL; + raw_spin_lock(&pi_state->pi_mutex.wait_lock); new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); -- cgit v1.2.3 From 59647b6ac3050dd964bc556fe6ef22f4db5b935c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 3 Feb 2010 09:33:05 +0100 Subject: futex: Handle futex value corruption gracefully The WARN_ON in lookup_pi_state which complains about a mismatch between pi_state->owner->pid and the pid which we retrieved from the user space futex is completely bogus. The code just emits the warning and then continues despite the fact that it detected an inconsistent state of the futex. A conveniant way for user space to spam the syslog. Replace the WARN_ON by a consistency check. If the values do not match return -EINVAL and let user space deal with the mess it created. This also fixes the missing task_pid_vnr() when we compare the pi_state->owner pid with the futex value. Reported-by: Jermome Marchand Signed-off-by: Thomas Gleixner Acked-by: Darren Hart Acked-by: Peter Zijlstra Cc: --- kernel/futex.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 06e8240d2abe..e7a35f1039e7 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -530,8 +530,25 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, return -EINVAL; WARN_ON(!atomic_read(&pi_state->refcount)); - WARN_ON(pid && pi_state->owner && - pi_state->owner->pid != pid); + + /* + * When pi_state->owner is NULL then the owner died + * and another waiter is on the fly. pi_state->owner + * is fixed up by the task which acquires + * pi_state->rt_mutex. + * + * We do not check for pid == 0 which can happen when + * the owner died and robust_list_exit() cleared the + * TID. + */ + if (pid && pi_state->owner) { + /* + * Bail out if user space manipulated the + * futex value. + */ + if (pid != task_pid_vnr(pi_state->owner)) + return -EINVAL; + } atomic_inc(&pi_state->refcount); *ps = pi_state; -- cgit v1.2.3 From b9c3032277f756e73f6c673419dc414155e04e46 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Feb 2010 18:08:52 +0100 Subject: hrtimer, softirq: Fix hrtimer->softirq trampoline hrtimers callbacks are always done from hardirq context, either the jiffy tick interrupt or the hrtimer device interrupt. [ there is currently one exception that can still call a hrtimer callback from softirq, but even in that case this will still work correctly. ] Reported-by: Wei Yongjun Signed-off-by: Peter Zijlstra Cc: Yury Polyanskiy Tested-by: Wei Yongjun Acked-by: David S. Miller LKML-Reference: <1265120401.24455.306.camel@laptop> Signed-off-by: Thomas Gleixner --- kernel/softirq.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index a09502e2ef75..7c1a67ef0274 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -500,22 +500,17 @@ EXPORT_SYMBOL(tasklet_kill); */ /* - * The trampoline is called when the hrtimer expires. If this is - * called from the hrtimer interrupt then we schedule the tasklet as - * the timer callback function expects to run in softirq context. If - * it's called in softirq context anyway (i.e. high resolution timers - * disabled) then the hrtimer callback is called right away. + * The trampoline is called when the hrtimer expires. It schedules a tasklet + * to run __tasklet_hrtimer_trampoline() which in turn will call the intended + * hrtimer callback, but from softirq context. */ static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) { struct tasklet_hrtimer *ttimer = container_of(timer, struct tasklet_hrtimer, timer); - if (hrtimer_is_hres_active(timer)) { - tasklet_hi_schedule(&ttimer->tasklet); - return HRTIMER_NORESTART; - } - return ttimer->function(timer); + tasklet_hi_schedule(&ttimer->tasklet); + return HRTIMER_NORESTART; } /* -- cgit v1.2.3 From cd757645fbdc34a8343c04bb0e74e06fccc2cb10 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Sat, 30 Jan 2010 10:25:18 +0530 Subject: perf: Make bp_len type to u64 generic across the arch Change 'bp_len' type to __u64 to make it work across archs as the s390 architecture watch point length can be upto 2^64. reference: http://lkml.org/lkml/2010/1/25/212 This is an ABI change that is not backward compatible with the previous hardware breakpoint info layout integrated in this development cycle, a rebuilt of perf tools is necessary for versions based on 2.6.33-rc1 - 2.6.33-rc6 to work with a kernel based on this patch. Signed-off-by: Mahesh Salgaonkar Acked-by: Peter Zijlstra Cc: Ananth N Mavinakayanahalli Cc: "K. Prasad" Cc: Maneesh Soni Cc: Heiko Carstens Cc: Martin LKML-Reference: <20100130045518.GA20776@in.ibm.com> Signed-off-by: Frederic Weisbecker --- include/linux/hw_breakpoint.h | 2 +- include/linux/perf_event.h | 6 ++---- kernel/hw_breakpoint.c | 2 +- kernel/perf_event.c | 2 +- 4 files changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index 070ba0621738..5977b724f7c6 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -44,7 +44,7 @@ static inline int hw_breakpoint_type(struct perf_event *bp) return bp->attr.bp_type; } -static inline int hw_breakpoint_len(struct perf_event *bp) +static inline unsigned long hw_breakpoint_len(struct perf_event *bp) { return bp->attr.bp_len; } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8fa71874113f..a177698d95e2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -211,11 +211,9 @@ struct perf_event_attr { __u32 wakeup_watermark; /* bytes before wakeup */ }; - __u32 __reserved_2; - - __u64 bp_addr; __u32 bp_type; - __u32 bp_len; + __u64 bp_addr; + __u64 bp_len; }; /* diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 8a5c7d55ac9f..967e66143e11 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -360,8 +360,8 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { u64 old_addr = bp->attr.bp_addr; + u64 old_len = bp->attr.bp_len; int old_type = bp->attr.bp_type; - int old_len = bp->attr.bp_len; int err = 0; perf_event_disable(bp); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index d27746bd3a06..2b19297742cb 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4580,7 +4580,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->type >= PERF_TYPE_MAX) return -EINVAL; - if (attr->__reserved_1 || attr->__reserved_2) + if (attr->__reserved_1) return -EINVAL; if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) -- cgit v1.2.3 From c93d89f3dbf0202bf19c07960ca8602b48c2f9a0 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 27 Jan 2010 19:13:40 +0800 Subject: Export the symbol of getboottime and mmonotonic_to_bootbased Export getboottime and monotonic_to_bootbased in order to let them could be used by following patch. Cc: stable@kernel.org Signed-off-by: Jason Wang Signed-off-by: Marcelo Tosatti --- kernel/time/timekeeping.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7faaa32fbf4f..e2ab064c6d41 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -880,6 +880,7 @@ void getboottime(struct timespec *ts) set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); } +EXPORT_SYMBOL_GPL(getboottime); /** * monotonic_to_bootbased - Convert the monotonic time to boot based. @@ -889,6 +890,7 @@ void monotonic_to_bootbased(struct timespec *ts) { *ts = timespec_add_safe(*ts, total_sleep_time); } +EXPORT_SYMBOL_GPL(monotonic_to_bootbased); unsigned long get_seconds(void) { -- cgit v1.2.3 From a9bb18f36c8056f0712fb28c52c0f85d98438dfb Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 10 Feb 2010 17:23:47 +0100 Subject: tracing/kprobes: Fix probe parsing Trying to add a probe like: echo p:myprobe 0x10000 > /sys/kernel/debug/tracing/kprobe_events will fail since the wrong pointer is passed to strict_strtoul when trying to convert the address to an unsigned long. Signed-off-by: Heiko Carstens Acked-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <20100210162346.GA6933@osiris.boeblingen.de.ibm.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 6ea90c0e2c96..50b1b8239806 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -689,7 +689,7 @@ static int create_trace_probe(int argc, char **argv) return -EINVAL; } /* an address specified */ - ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); + ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr); if (ret) { pr_info("Failed to parse address.\n"); return ret; -- cgit v1.2.3 From 6f93d0a7c83772997b81c30d6f519a9a5dbab6a9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 14 Feb 2010 11:12:04 +0100 Subject: perf_events: Fix FORK events Commit 22e19085 ("Honour event state for aux stream data") introduced a bug where we would drop FORK events. The thing is that we deliver FORK events to the child process' event, which at that time will be PERF_EVENT_STATE_INACTIVE because the child won't be scheduled in (we're in the middle of fork). Solve this twice, change the event state filter to exclude only disabled (STATE_OFF) or worse, and deliver FORK events to the current (parent). Signed-off-by: Peter Zijlstra Cc: Anton Blanchard Cc: Arnaldo Carvalho de Melo LKML-Reference: <1266142324.5273.411.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 2b19297742cb..2ae7409bf38f 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -3259,8 +3259,6 @@ static void perf_event_task_output(struct perf_event *event, task_event->event_id.tid = perf_event_tid(event, task); task_event->event_id.ptid = perf_event_tid(event, current); - task_event->event_id.time = perf_clock(); - perf_output_put(&handle, task_event->event_id); perf_output_end(&handle); @@ -3268,7 +3266,7 @@ static void perf_event_task_output(struct perf_event *event, static int perf_event_task_match(struct perf_event *event) { - if (event->state != PERF_EVENT_STATE_ACTIVE) + if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; if (event->cpu != -1 && event->cpu != smp_processor_id()) @@ -3300,7 +3298,7 @@ static void perf_event_task_event(struct perf_task_event *task_event) cpuctx = &get_cpu_var(perf_cpu_context); perf_event_task_ctx(&cpuctx->ctx, task_event); if (!ctx) - ctx = rcu_dereference(task_event->task->perf_event_ctxp); + ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) perf_event_task_ctx(ctx, task_event); put_cpu_var(perf_cpu_context); @@ -3331,6 +3329,7 @@ static void perf_event_task(struct task_struct *task, /* .ppid */ /* .tid */ /* .ptid */ + .time = perf_clock(), }, }; @@ -3380,7 +3379,7 @@ static void perf_event_comm_output(struct perf_event *event, static int perf_event_comm_match(struct perf_event *event) { - if (event->state != PERF_EVENT_STATE_ACTIVE) + if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; if (event->cpu != -1 && event->cpu != smp_processor_id()) @@ -3500,7 +3499,7 @@ static void perf_event_mmap_output(struct perf_event *event, static int perf_event_mmap_match(struct perf_event *event, struct perf_mmap_event *mmap_event) { - if (event->state != PERF_EVENT_STATE_ACTIVE) + if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; if (event->cpu != -1 && event->cpu != smp_processor_id()) -- cgit v1.2.3 From 1a02d59aba9b61b820517fb135086471c065b573 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Wed, 27 Jan 2010 17:09:34 +0300 Subject: kfifo: Make kfifo_initialized work after kfifo_free After kfifo rework it's no longer possible to reliably know if kfifo is usable, since after kfifo_free(), kfifo_initialized() would still return true. The correct behaviour is needed for at least FHCI USB driver. This patch fixes the issue by resetting the kfifo to zero values (the same approach is used in kfifo_alloc() if allocation failed). Signed-off-by: Anton Vorontsov Acked-by: Stefani Seibold Signed-off-by: Greg Kroah-Hartman --- kernel/kfifo.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 498cabba225e..559fb5582b60 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -97,6 +97,7 @@ EXPORT_SYMBOL(kfifo_alloc); void kfifo_free(struct kfifo *fifo) { kfree(fifo->buffer); + _kfifo_init(fifo, NULL, 0); } EXPORT_SYMBOL(kfifo_free); -- cgit v1.2.3 From 5a5e0f4c7038168e38d1db6af09d1ac715ee9888 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Wed, 27 Jan 2010 17:09:38 +0300 Subject: kfifo: Don't use integer as NULL pointer This patch fixes following sparse warnings: include/linux/kfifo.h:127:25: warning: Using plain integer as NULL pointer kernel/kfifo.c:83:21: warning: Using plain integer as NULL pointer Signed-off-by: Anton Vorontsov Acked-by: Stefani Seibold Signed-off-by: Greg Kroah-Hartman --- include/linux/kfifo.h | 2 +- kernel/kfifo.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index 6f6c5f300af6..bc0fc795bd35 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -124,7 +124,7 @@ extern __must_check unsigned int kfifo_out_peek(struct kfifo *fifo, */ static inline bool kfifo_initialized(struct kfifo *fifo) { - return fifo->buffer != 0; + return fifo->buffer != NULL; } /** diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 559fb5582b60..35edbe22e9a9 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -80,7 +80,7 @@ int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask) buffer = kmalloc(size, gfp_mask); if (!buffer) { - _kfifo_init(fifo, 0, 0); + _kfifo_init(fifo, NULL, 0); return -ENOMEM; } -- cgit v1.2.3 From 701188374b6f1ef9cf7e4dce4a2e69ef4c0012ac Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 22 Feb 2010 12:44:16 -0800 Subject: kernel/sys.c: fix missing rcu protection for sys_getpriority() find_task_by_vpid() is not safe without rcu_read_lock(). 2.6.33-rc7 got RCU protection for sys_setpriority() but missed it for sys_getpriority(). Signed-off-by: Tetsuo Handa Cc: Oleg Nesterov Cc: "Paul E. McKenney" Acked-by: Serge Hallyn Acked-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 26a6b73a6b85..18bde979f346 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -222,6 +222,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) if (which > PRIO_USER || which < PRIO_PROCESS) return -EINVAL; + rcu_read_lock(); read_lock(&tasklist_lock); switch (which) { case PRIO_PROCESS: @@ -267,6 +268,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) } out_unlock: read_unlock(&tasklist_lock); + rcu_read_unlock(); return retval; } -- cgit v1.2.3