From bfdb4d9f0f611687d71cf6a460efc9e755f4a462 Mon Sep 17 00:00:00 2001 From: Arun R Bharadwaj Date: Tue, 23 Jun 2009 10:00:58 +0530 Subject: timers: Fix timer_migration interface which accepts any number as input Poornima Nayek reported: | Timer migration interface /proc/sys/kernel/timer_migration in | 2.6.30-git9 accepts any numerical value as input. | | Steps to reproduce: | 1. echo -6666666 > /proc/sys/kernel/timer_migration | 2. cat /proc/sys/kernel/timer_migration | -6666666 | | 1. echo 44444444444444444444444444444444444444444444444444444444444 > /proc/sys/kernel/timer_migration | 2. cat /proc/sys/kernel/timer_migration | -1357789412 | | Expected behavior: Should 'echo: write error: Invalid argument' while | setting any value other then 0 & 1 Restrict valid values to 0 and 1. Reported-by: Poornima Nayak Tested-by: Poornima Nayak Signed-off-by: Arun R Bharadwaj Cc: poornima nayak Cc: Arun Bharadwaj LKML-Reference: <20090623043058.GA3249@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 62e4ff9968b5..c428ba161db1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -335,7 +335,10 @@ static struct ctl_table kern_table[] = { .data = &sysctl_timer_migration, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one, }, #endif { -- cgit v1.2.3 From f29ac756a40d0f1bb07d682ea521e7b666ff06d5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 19 Jun 2009 18:27:26 +0200 Subject: perf_counter: Optimize perf_swcounter_event() Similar to tracepoints, use an enable variable to reduce overhead when unused. Only look for a counter of a particular event type when we know there is at least one in the system. 
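The idea is the same enable-variable scheme used for tracepoints: keep a per-event-type enable count that is bumped whenever a software counter of that type is created, and make the event hook a cheap inline test that only calls into the slow path when the count is non-zero. A minimal sketch of that pattern, using C11 atomics in place of the kernel's atomic_t helpers and hypothetical names (the real patch below uses perf_swcounter_enabled[] and __perf_swcounter_event()):

	#include <stdatomic.h>

	#define NR_SW_EVENTS 16	/* stand-in for PERF_COUNT_SW_MAX */

	static atomic_int sw_event_enabled[NR_SW_EVENTS];

	/* slow path: only reached while at least one counter of this type exists */
	static void __sw_event(unsigned int event, unsigned long nr)
	{
		(void)event; (void)nr;	/* deliver the event to interested counters */
	}

	/* fast path: a single atomic read when the event type is unused */
	static inline void sw_event(unsigned int event, unsigned long nr)
	{
		if (atomic_load(&sw_event_enabled[event]))
			__sw_event(event, nr);
	}

	/* counter creation/destruction keep the per-type enable count up to date */
	static inline void sw_event_counter_init(unsigned int event)
	{
		atomic_fetch_add(&sw_event_enabled[event], 1);
	}

	static inline void sw_event_counter_destroy(unsigned int event)
	{
		atomic_fetch_sub(&sw_event_enabled[event], 1);
	}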
Signed-off-by: Peter Zijlstra LKML-Reference: Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 11 ++++++++++- kernel/perf_counter.c | 18 +++++++++++++++--- 2 files changed, 25 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 89698d8aba5c..e7213e46cf9c 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -669,7 +669,16 @@ static inline int is_software_counter(struct perf_counter *counter) (counter->attr.type != PERF_TYPE_HW_CACHE); } -extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64); +extern atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX]; + +extern void __perf_swcounter_event(u32, u64, int, struct pt_regs *, u64); + +static inline void +perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) +{ + if (atomic_read(&perf_swcounter_enabled[event])) + __perf_swcounter_event(event, nr, nmi, regs, addr); +} extern void __perf_counter_mmap(struct vm_area_struct *vma); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 1a933a221ea4..7515c7695428 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3317,8 +3317,8 @@ out: put_cpu_var(perf_cpu_context); } -void -perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) +void __perf_swcounter_event(u32 event, u64 nr, int nmi, + struct pt_regs *regs, u64 addr) { struct perf_sample_data data = { .regs = regs, @@ -3509,9 +3509,19 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) } #endif +atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX]; + +static void sw_perf_counter_destroy(struct perf_counter *counter) +{ + u64 event = counter->attr.config; + + atomic_dec(&perf_swcounter_enabled[event]); +} + static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) { const struct pmu *pmu = NULL; + u64 event = counter->attr.config; /* * Software counters (currently) can't in general distinguish @@ -3520,7 +3530,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) * to be kernel events, and page faults are never hypervisor * events. */ - switch (counter->attr.config) { + switch (event) { case PERF_COUNT_SW_CPU_CLOCK: pmu = &perf_ops_cpu_clock; @@ -3541,6 +3551,8 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_SW_PAGE_FAULTS_MAJ: case PERF_COUNT_SW_CONTEXT_SWITCHES: case PERF_COUNT_SW_CPU_MIGRATIONS: + atomic_inc(&perf_swcounter_enabled[event]); + counter->destroy = sw_perf_counter_destroy; pmu = &perf_ops_generic; break; } -- cgit v1.2.3 From b84fbc9fb1d943e2c5f4efe52ed0e3c93a4bdb6a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jun 2009 13:57:40 +0200 Subject: perf_counter: Push inherit into perf_counter_alloc() Teach perf_counter_alloc() about inheritance so that we can optimize the inherit path in the next patch. Remove the child_counter->attr.inherit = 1 line because the only way to get there is if parent_counter->attr.inherit == 1 and we copy the attrs.
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 7515c7695428..0a45490f4029 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3568,6 +3568,7 @@ perf_counter_alloc(struct perf_counter_attr *attr, int cpu, struct perf_counter_context *ctx, struct perf_counter *group_leader, + struct perf_counter *parent_counter, gfp_t gfpflags) { const struct pmu *pmu; @@ -3603,6 +3604,8 @@ perf_counter_alloc(struct perf_counter_attr *attr, counter->ctx = ctx; counter->oncpu = -1; + counter->parent = parent_counter; + counter->ns = get_pid_ns(current->nsproxy->pid_ns); counter->id = atomic64_inc_return(&perf_counter_id); @@ -3827,7 +3830,7 @@ SYSCALL_DEFINE5(perf_counter_open, } counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, - GFP_KERNEL); + NULL, GFP_KERNEL); ret = PTR_ERR(counter); if (IS_ERR(counter)) goto err_put_context; @@ -3893,7 +3896,8 @@ inherit_counter(struct perf_counter *parent_counter, child_counter = perf_counter_alloc(&parent_counter->attr, parent_counter->cpu, child_ctx, - group_leader, GFP_KERNEL); + group_leader, parent_counter, + GFP_KERNEL); if (IS_ERR(child_counter)) return child_counter; get_ctx(child_ctx); @@ -3916,12 +3920,6 @@ inherit_counter(struct perf_counter *parent_counter, */ add_counter_to_ctx(child_counter, child_ctx); - child_counter->parent = parent_counter; - /* - * inherit into child's child as well: - */ - child_counter->attr.inherit = 1; - /* * Get a reference to the parent filp - we will fput it * when the child counter exits. This is safe to do because -- cgit v1.2.3 From f344011ccb85469445369153c3d27c4ee4bc2ac8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jun 2009 13:58:35 +0200 Subject: perf_counter: Optimize perf_counter_alloc()'s inherit case We don't need to add usage counts for swcounter and attr usage models for inherited counters since the parent counter will always have one, which suffices to generate the needed output. This avoids up to 3 global atomic increments per inherited counter. 
LKML-Reference: Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0a45490f4029..c2b19c111718 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1508,11 +1508,13 @@ static void free_counter(struct perf_counter *counter) { perf_pending_sync(counter); - atomic_dec(&nr_counters); - if (counter->attr.mmap) - atomic_dec(&nr_mmap_counters); - if (counter->attr.comm) - atomic_dec(&nr_comm_counters); + if (!counter->parent) { + atomic_dec(&nr_counters); + if (counter->attr.mmap) + atomic_dec(&nr_mmap_counters); + if (counter->attr.comm) + atomic_dec(&nr_comm_counters); + } if (counter->destroy) counter->destroy(counter); @@ -3515,6 +3517,8 @@ static void sw_perf_counter_destroy(struct perf_counter *counter) { u64 event = counter->attr.config; + WARN_ON(counter->parent); + atomic_dec(&perf_swcounter_enabled[event]); } @@ -3551,8 +3555,10 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_SW_PAGE_FAULTS_MAJ: case PERF_COUNT_SW_CONTEXT_SWITCHES: case PERF_COUNT_SW_CPU_MIGRATIONS: - atomic_inc(&perf_swcounter_enabled[event]); - counter->destroy = sw_perf_counter_destroy; + if (!counter->parent) { + atomic_inc(&perf_swcounter_enabled[event]); + counter->destroy = sw_perf_counter_destroy; + } pmu = &perf_ops_generic; break; } @@ -3663,11 +3669,13 @@ done: counter->pmu = pmu; - atomic_inc(&nr_counters); - if (counter->attr.mmap) - atomic_inc(&nr_mmap_counters); - if (counter->attr.comm) - atomic_inc(&nr_comm_counters); + if (!counter->parent) { + atomic_inc(&nr_counters); + if (counter->attr.mmap) + atomic_inc(&nr_mmap_counters); + if (counter->attr.comm) + atomic_inc(&nr_comm_counters); + } return counter; } -- cgit v1.2.3 From 35aa901c0b66cb3c2eeee23f13624014825a44a8 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 11 Jun 2009 14:31:33 -0400 Subject: Audit: fix audit watch use after free When an audit watch is added to a parent the temporary watch inside the original krule from userspace is freed. Yet the original watch is used after the real watch was created in audit_add_rules() Signed-off-by: Eric Paris --- kernel/auditfilter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 713098ee5a02..19c0a0a2cede 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1320,6 +1320,8 @@ static inline int audit_add_rule(struct audit_entry *entry) mutex_unlock(&audit_filter_mutex); goto error; } + /* entry->rule.watch may have changed during audit_add_watch() */ + watch = entry->rule.watch; h = audit_hash_ino((u32)watch->ino); list = &audit_inode_hash[h]; } -- cgit v1.2.3 From b87ce6e4187c24b06483c8266822ce5e6b7fa7f3 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 11 Jun 2009 14:31:34 -0400 Subject: Audit: better estimation of execve record length The audit execve record splitting code estimates the length of the message generated. But it forgot to include the "" that wrap each string in its estimation. This means that execve messages with lots of tiny (1-2 byte) arguments could still cause records greater than 8k to be emitted. Simply fix the estimate. 
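Concretely, each argument is logged as ' aN="value"', so the fixed per-argument overhead is the number of digits in N plus 5 bytes (the space, 'a', '=', and the two quotes); the old code added only 3 and so under-counted by 2 bytes per argument. A small userspace sanity check of that arithmetic (a sketch that only mirrors the snprintf-based estimate for the quoted form, not the kernel's hex-encoding of control characters; arg_num 7 is an arbitrary example):

	#include <assert.h>
	#include <stdio.h>

	int main(void)
	{
		char num[12], frag[32];
		int arg_num = 7;

		/* the fixed estimate: digits of arg_num plus 5 for ' a=""' */
		int est = snprintf(num, sizeof(num), "%d", arg_num) + 5;

		/* what is actually emitted around an (empty, quoted) argument value */
		int real = snprintf(frag, sizeof(frag), " a%d=\"\"", arg_num);

		assert(est == real);	/* 6 == 6; the old "+ 3" estimate gave only 4 */
		printf("estimate=%d actual=%d\n", est, real);
		return 0;
	}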
Signed-off-by: Eric Paris --- kernel/auditsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 7d6ac7c1f414..b14d234b85f3 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1024,8 +1024,8 @@ static int audit_log_single_execve_arg(struct audit_context *context, { char arg_num_len_buf[12]; const char __user *tmp_p = p; - /* how many digits are in arg_num? 3 is the length of " a=" */ - size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3; + /* how many digits are in arg_num? 5 is the length of ' a=""' */ + size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5; size_t len, len_left, to_send; size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; unsigned int i, has_cntl = 0, too_long = 0; -- cgit v1.2.3 From e85188f424c8eec7f311deed9a70bec57aeed741 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 11 Jun 2009 14:31:34 -0400 Subject: Audit: dereferencing krule as if it were an audit_watch audit_update_watch() runs over all of the rules for a given watch, duplicates them, and attaches a new watch to them. After it finishes that process and has called free on all of the old rules (possibly still inside the RCU grace period), it proceeds to use the last element from list_for_each_entry_safe() as if it were a krule, when it is actually the audit_watch that was anchoring the list, in order to output a message about audit rules changing. This patch unifies the audit message from two different places into a helper function and calls it from the correct location in audit_update_watch(). We will now get an audit message about the config changing for each rule (with each rule's filterkey) rather than the previous garbage. Signed-off-by: Eric Paris --- kernel/auditfilter.c | 58 ++++++++++++++++++++++------------------------------ 1 file changed, 24 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 19c0a0a2cede..e7466dd145c9 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -977,6 +977,27 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old, return entry; } +static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op) +{ + if (audit_enabled) { + struct audit_buffer *ab; + ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); + audit_log_format(ab, "auid=%u ses=%u op=", + audit_get_loginuid(current), + audit_get_sessionid(current)); + audit_log_string(ab, op); + audit_log_format(ab, " path="); + audit_log_untrustedstring(ab, w->path); + if (r->filterkey) { + audit_log_format(ab, " key="); + audit_log_untrustedstring(ab, r->filterkey); + } else + audit_log_format(ab, " key=(null)"); + audit_log_format(ab, " list=%d res=1", r->listnr); + audit_log_end(ab); + } +} + /* Update inode info in audit rules based on filesystem event. 
*/ static void audit_update_watch(struct audit_parent *parent, const char *dname, dev_t dev, @@ -1023,24 +1044,11 @@ static void audit_update_watch(struct audit_parent *parent, &nentry->rule.list); } + audit_watch_log_rule_change(r, owatch, "updated rules"); + call_rcu(&oentry->rcu, audit_free_rule_rcu); } - if (audit_enabled) { - struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_NOFS, - AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "auid=%u ses=%u", - audit_get_loginuid(current), - audit_get_sessionid(current)); - audit_log_format(ab, - " op=updated rules specifying path="); - audit_log_untrustedstring(ab, owatch->path); - audit_log_format(ab, " with dev=%u ino=%lu\n", - dev, ino); - audit_log_format(ab, " list=%d res=1", r->listnr); - audit_log_end(ab); - } audit_remove_watch(owatch); goto add_watch_to_parent; /* event applies to a single watch */ } @@ -1065,25 +1073,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent) list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { list_for_each_entry_safe(r, nextr, &w->rules, rlist) { e = container_of(r, struct audit_entry, rule); - if (audit_enabled) { - struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_NOFS, - AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "auid=%u ses=%u", - audit_get_loginuid(current), - audit_get_sessionid(current)); - audit_log_format(ab, " op=remove rule path="); - audit_log_untrustedstring(ab, w->path); - if (r->filterkey) { - audit_log_format(ab, " key="); - audit_log_untrustedstring(ab, - r->filterkey); - } else - audit_log_format(ab, " key=(null)"); - audit_log_format(ab, " list=%d res=1", - r->listnr); - audit_log_end(ab); - } + audit_watch_log_rule_change(r, w, "remove rule"); list_del(&r->rlist); list_del(&r->list); list_del_rcu(&e->list); -- cgit v1.2.3 From 038cbcf65fd6a30c79e3917690b8c46321a27915 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 11 Jun 2009 14:31:35 -0400 Subject: Audit: unify the printk of an skb when auditd not around Remove code duplication of skb printk when auditd is not around in userspace to deal with this message. Signed-off-by: Eric Paris --- kernel/audit.c | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 9442c3533ba9..f7ab4a479cdd 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -375,6 +375,25 @@ static void audit_hold_skb(struct sk_buff *skb) kfree_skb(skb); } +/* + * For one reason or another this nlh isn't getting delivered to the userspace + * audit daemon, just send it to printk. 
+ */ +static void audit_printk_skb(struct sk_buff *skb) +{ + struct nlmsghdr *nlh = nlmsg_hdr(skb); + char *data = NLMSG_DATA(nlh); + + if (nlh->nlmsg_type != AUDIT_EOE) { + if (printk_ratelimit()) + printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, data); + else + audit_log_lost("printk limit exceeded\n"); + } + + audit_hold_skb(skb); +} + static void kauditd_send_skb(struct sk_buff *skb) { int err; @@ -427,14 +446,8 @@ static int kauditd_thread(void *dummy) if (skb) { if (audit_pid) kauditd_send_skb(skb); - else { - if (printk_ratelimit()) - printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0)); - else - audit_log_lost("printk limit exceeded\n"); - - audit_hold_skb(skb); - } + else + audit_printk_skb(skb); } else { DECLARE_WAITQUEUE(wait, current); set_current_state(TASK_INTERRUPTIBLE); @@ -1475,15 +1488,7 @@ void audit_log_end(struct audit_buffer *ab) skb_queue_tail(&audit_skb_queue, ab->skb); wake_up_interruptible(&kauditd_wait); } else { - if (nlh->nlmsg_type != AUDIT_EOE) { - if (printk_ratelimit()) { - printk(KERN_NOTICE "type=%d %s\n", - nlh->nlmsg_type, - ab->skb->data + NLMSG_SPACE(0)); - } else - audit_log_lost("printk limit exceeded\n"); - } - audit_hold_skb(ab->skb); + audit_printk_skb(ab->skb); } ab->skb = NULL; } -- cgit v1.2.3 From ee080e6ce93d5993390bccf68c1df5efd9351276 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 11 Jun 2009 14:31:35 -0400 Subject: Audit: cleanup netlink mesg handling The audit handling of netlink messages is all over the place. Clean things up, use predetermined macros, generally make it more readable. Signed-off-by: Eric Paris --- kernel/audit.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index f7ab4a479cdd..01082a1d2bc5 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -528,22 +528,20 @@ struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, { struct sk_buff *skb; struct nlmsghdr *nlh; - int len = NLMSG_SPACE(size); void *data; int flags = multi ? NLM_F_MULTI : 0; int t = done ? NLMSG_DONE : type; - skb = alloc_skb(len, GFP_KERNEL); + skb = nlmsg_new(size, GFP_KERNEL); if (!skb) return NULL; - nlh = NLMSG_PUT(skb, pid, seq, t, size); - nlh->nlmsg_flags = flags; - data = NLMSG_DATA(nlh); + nlh = NLMSG_NEW(skb, pid, seq, t, size, flags); + data = NLMSG_DATA(nlh); memcpy(data, payload, size); return skb; -nlmsg_failure: /* Used by NLMSG_PUT */ +nlmsg_failure: /* Used by NLMSG_NEW */ if (skb) kfree_skb(skb); return NULL; @@ -1083,18 +1081,20 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, goto err; } - ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask); - if (!ab->skb) - goto err; - ab->ctx = ctx; ab->gfp_mask = gfp_mask; - nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0)); - nlh->nlmsg_type = type; - nlh->nlmsg_flags = 0; - nlh->nlmsg_pid = 0; - nlh->nlmsg_seq = 0; + + ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); + if (!ab->skb) + goto nlmsg_failure; + + nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0); + return ab; + +nlmsg_failure: /* Used by NLMSG_NEW */ + kfree_skb(ab->skb); + ab->skb = NULL; err: audit_buffer_free(ab); return NULL; -- cgit v1.2.3 From ea7ae60bfe39aeedfb29571c47280bf0067ee5f3 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 11 Jun 2009 14:31:35 -0400 Subject: Audit: clean up audit_receive_skb It is hard to parse clearly what audit_receive_skb is doing to the netlink message. Clean the function up so that it is easy to see what is going on. 
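The rewritten loop in the patch below is the standard netlink multi-message walk. A rough sketch of that pattern, using the existing netlink helpers (nlmsg_hdr(), NLMSG_OK(), NLMSG_NEXT(), netlink_ack()); handle_one() stands in for the real per-message handler and is a hypothetical name:

	/* sketch: iterate the netlink messages packed into one skb */
	static void walk_netlink_skb(struct sk_buff *skb)
	{
		struct nlmsghdr *nlh = nlmsg_hdr(skb);
		int len = skb->len;	/* signed: NLMSG_NEXT may step it below 0 */

		while (NLMSG_OK(nlh, len)) {
			int err = handle_one(nlh);	/* hypothetical handler */

			/* ack on error, or when the sender asked for one */
			if (err || (nlh->nlmsg_flags & NLM_F_ACK))
				netlink_ack(skb, nlh, err);

			nlh = NLMSG_NEXT(nlh, len);
		}
	}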
Signed-off-by: Eric Paris --- kernel/audit.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 01082a1d2bc5..ce77e81a0e71 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -937,28 +937,29 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } /* - * Get message from skb (based on rtnetlink_rcv_skb). Each message is - * processed by audit_receive_msg. Malformed skbs with wrong length are - * discarded silently. + * Get message from skb. Each message is processed by audit_receive_msg. + * Malformed skbs with wrong length are discarded silently. */ static void audit_receive_skb(struct sk_buff *skb) { - int err; - struct nlmsghdr *nlh; - u32 rlen; + struct nlmsghdr *nlh; + /* + * len MUST be signed for NLMSG_NEXT to be able to dec it below 0 + * if the nlmsg_len was not aligned + */ + int len; + int err; - while (skb->len >= NLMSG_SPACE(0)) { - nlh = nlmsg_hdr(skb); - if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) - return; - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - if ((err = audit_receive_msg(skb, nlh))) { + nlh = nlmsg_hdr(skb); + len = skb->len; + + while (NLMSG_OK(nlh, len)) { + err = audit_receive_msg(skb, nlh); + /* if err or if this message says it wants a response */ + if (err || (nlh->nlmsg_flags & NLM_F_ACK)) netlink_ack(skb, nlh, err); - } else if (nlh->nlmsg_flags & NLM_F_ACK) - netlink_ack(skb, nlh, 0); - skb_pull(skb, rlen); + + nlh = NLMSG_NEXT(nlh, len); } } -- cgit v1.2.3 From cfcad62c74abfef83762dc05a556d21bdf3980a2 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 11 Jun 2009 14:31:36 -0400 Subject: audit: separate audit inode watches into a subfile In preparation for converting audit to use fsnotify instead of inotify we separate the inode watching code into its own file. This is similar to how the audit tree watching code is already separated into audit_tree.c Signed-off-by: Eric Paris --- kernel/Makefile | 2 +- kernel/audit.c | 16 -- kernel/audit.h | 39 ++-- kernel/audit_watch.c | 534 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/auditfilter.c | 481 ++-------------------------------------------- kernel/auditsc.c | 6 +- 6 files changed, 572 insertions(+), 506 deletions(-) create mode 100644 kernel/audit_watch.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 0a32cb21ec97..da750010a6fc 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -70,7 +70,7 @@ obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o -obj-$(CONFIG_AUDITSYSCALL) += auditsc.o +obj-$(CONFIG_AUDITSYSCALL) += auditsc.o audit_watch.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_KPROBES) += kprobes.o diff --git a/kernel/audit.c b/kernel/audit.c index ce77e81a0e71..e07ad2340dbe 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -115,9 +115,6 @@ static atomic_t audit_lost = ATOMIC_INIT(0); /* The netlink socket. */ static struct sock *audit_sock; -/* Inotify handle. 
*/ -struct inotify_handle *audit_ih; - /* Hash for inode-based rules */ struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; @@ -971,13 +968,6 @@ static void audit_receive(struct sk_buff *skb) mutex_unlock(&audit_cmd_mutex); } -#ifdef CONFIG_AUDITSYSCALL -static const struct inotify_operations audit_inotify_ops = { - .handle_event = audit_handle_ievent, - .destroy_watch = audit_free_parent, -}; -#endif - /* Initialize audit support at boot time. */ static int __init audit_init(void) { @@ -1003,12 +993,6 @@ static int __init audit_init(void) audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); -#ifdef CONFIG_AUDITSYSCALL - audit_ih = inotify_init(&audit_inotify_ops); - if (IS_ERR(audit_ih)) - audit_panic("cannot initialize inotify handle"); -#endif - for (i = 0; i < AUDIT_INODE_BUCKETS; i++) INIT_LIST_HEAD(&audit_inode_hash[i]); diff --git a/kernel/audit.h b/kernel/audit.h index 16f18cac661b..704d5b01d9fd 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -53,18 +53,7 @@ enum audit_state { }; /* Rule lists */ -struct audit_parent; - -struct audit_watch { - atomic_t count; /* reference count */ - char *path; /* insertion path */ - dev_t dev; /* associated superblock device */ - unsigned long ino; /* associated inode number */ - struct audit_parent *parent; /* associated parent */ - struct list_head wlist; /* entry in parent->watches list */ - struct list_head rules; /* associated rules */ -}; - +struct audit_watch; struct audit_tree; struct audit_chunk; @@ -108,19 +97,31 @@ struct audit_netlink_list { int audit_send_list(void *); -struct inotify_watch; -/* Inotify handle */ -extern struct inotify_handle *audit_ih; - -extern void audit_free_parent(struct inotify_watch *); -extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32, - const char *, struct inode *); extern int selinux_audit_rule_update(void); extern struct mutex audit_filter_mutex; extern void audit_free_rule_rcu(struct rcu_head *); extern struct list_head audit_filter_list[]; +/* audit watch functions */ +extern unsigned long audit_watch_inode(struct audit_watch *watch); +extern dev_t audit_watch_dev(struct audit_watch *watch); +extern void audit_put_watch(struct audit_watch *watch); +extern void audit_get_watch(struct audit_watch *watch); +extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op); +extern int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw); +extern void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw); +extern int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, + struct nameidata *ndw); +extern void audit_remove_watch(struct audit_watch *watch); +extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list); +extern void audit_inotify_unregister(struct list_head *in_list); +extern char *audit_watch_path(struct audit_watch *watch); +extern struct list_head *audit_watch_rules(struct audit_watch *watch); + +extern struct audit_entry *audit_dupe_rule(struct audit_krule *old, + struct audit_watch *watch); + #ifdef CONFIG_AUDIT_TREE extern struct audit_chunk *audit_tree_lookup(const struct inode *); extern void audit_put_chunk(struct audit_chunk *); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c new file mode 100644 index 000000000000..da8be6d39c1a --- /dev/null +++ b/kernel/audit_watch.c @@ -0,0 +1,534 @@ +/* audit_watch.c -- watching inodes + * + * Copyright 2003-2009 Red Hat, Inc. + * Copyright 2005 Hewlett-Packard Development Company, L.P. 
+ * Copyright 2005 IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "audit.h" + +/* + * Reference counting: + * + * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED + * event. Each audit_watch holds a reference to its associated parent. + * + * audit_watch: if added to lists, lifetime is from audit_init_watch() to + * audit_remove_watch(). Additionally, an audit_watch may exist + * temporarily to assist in searching existing filter data. Each + * audit_krule holds a reference to its associated watch. + */ + +struct audit_watch { + atomic_t count; /* reference count */ + char *path; /* insertion path */ + dev_t dev; /* associated superblock device */ + unsigned long ino; /* associated inode number */ + struct audit_parent *parent; /* associated parent */ + struct list_head wlist; /* entry in parent->watches list */ + struct list_head rules; /* associated rules */ +}; + +struct audit_parent { + struct list_head ilist; /* entry in inotify registration list */ + struct list_head watches; /* associated watches */ + struct inotify_watch wdata; /* inotify watch data */ + unsigned flags; /* status flags */ +}; + +/* Inotify handle. */ +struct inotify_handle *audit_ih; + +/* + * audit_parent status flags: + * + * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to + * a filesystem event to ensure we're adding audit watches to a valid parent. + * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot + * receive them while we have nameidata, but must be used for IN_MOVE_SELF which + * we can receive while holding nameidata. + */ +#define AUDIT_PARENT_INVALID 0x001 + +/* Inotify events we care about. 
*/ +#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF + +static void audit_free_parent(struct inotify_watch *i_watch) +{ + struct audit_parent *parent; + + parent = container_of(i_watch, struct audit_parent, wdata); + WARN_ON(!list_empty(&parent->watches)); + kfree(parent); +} + +void audit_get_watch(struct audit_watch *watch) +{ + atomic_inc(&watch->count); +} + +void audit_put_watch(struct audit_watch *watch) +{ + if (atomic_dec_and_test(&watch->count)) { + WARN_ON(watch->parent); + WARN_ON(!list_empty(&watch->rules)); + kfree(watch->path); + kfree(watch); + } +} + +void audit_remove_watch(struct audit_watch *watch) +{ + list_del(&watch->wlist); + put_inotify_watch(&watch->parent->wdata); + watch->parent = NULL; + audit_put_watch(watch); /* match initial get */ +} + +char *audit_watch_path(struct audit_watch *watch) +{ + return watch->path; +} + +struct list_head *audit_watch_rules(struct audit_watch *watch) +{ + return &watch->rules; +} + +unsigned long audit_watch_inode(struct audit_watch *watch) +{ + return watch->ino; +} + +dev_t audit_watch_dev(struct audit_watch *watch) +{ + return watch->dev; +} + +/* Initialize a parent watch entry. */ +static struct audit_parent *audit_init_parent(struct nameidata *ndp) +{ + struct audit_parent *parent; + s32 wd; + + parent = kzalloc(sizeof(*parent), GFP_KERNEL); + if (unlikely(!parent)) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&parent->watches); + parent->flags = 0; + + inotify_init_watch(&parent->wdata); + /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ + get_inotify_watch(&parent->wdata); + wd = inotify_add_watch(audit_ih, &parent->wdata, + ndp->path.dentry->d_inode, AUDIT_IN_WATCH); + if (wd < 0) { + audit_free_parent(&parent->wdata); + return ERR_PTR(wd); + } + + return parent; +} + +/* Initialize a watch entry. */ +static struct audit_watch *audit_init_watch(char *path) +{ + struct audit_watch *watch; + + watch = kzalloc(sizeof(*watch), GFP_KERNEL); + if (unlikely(!watch)) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&watch->rules); + atomic_set(&watch->count, 1); + watch->path = path; + watch->dev = (dev_t)-1; + watch->ino = (unsigned long)-1; + + return watch; +} + +/* Translate a watch string to kernel respresentation. */ +int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op) +{ + struct audit_watch *watch; + + if (!audit_ih) + return -EOPNOTSUPP; + + if (path[0] != '/' || path[len-1] == '/' || + krule->listnr != AUDIT_FILTER_EXIT || + op != Audit_equal || + krule->inode_f || krule->watch || krule->tree) + return -EINVAL; + + watch = audit_init_watch(path); + if (IS_ERR(watch)) + return PTR_ERR(watch); + + audit_get_watch(watch); + krule->watch = watch; + + return 0; +} + +/* Duplicate the given audit watch. The new watch's rules list is initialized + * to an empty list and wlist is undefined. 
*/ +static struct audit_watch *audit_dupe_watch(struct audit_watch *old) +{ + char *path; + struct audit_watch *new; + + path = kstrdup(old->path, GFP_KERNEL); + if (unlikely(!path)) + return ERR_PTR(-ENOMEM); + + new = audit_init_watch(path); + if (IS_ERR(new)) { + kfree(path); + goto out; + } + + new->dev = old->dev; + new->ino = old->ino; + get_inotify_watch(&old->parent->wdata); + new->parent = old->parent; + +out: + return new; +} + +static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op) +{ + if (audit_enabled) { + struct audit_buffer *ab; + ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); + audit_log_format(ab, "auid=%u ses=%u op=", + audit_get_loginuid(current), + audit_get_sessionid(current)); + audit_log_string(ab, op); + audit_log_format(ab, " path="); + audit_log_untrustedstring(ab, w->path); + if (r->filterkey) { + audit_log_format(ab, " key="); + audit_log_untrustedstring(ab, r->filterkey); + } else + audit_log_format(ab, " key=(null)"); + audit_log_format(ab, " list=%d res=1", r->listnr); + audit_log_end(ab); + } +} + +/* Update inode info in audit rules based on filesystem event. */ +static void audit_update_watch(struct audit_parent *parent, + const char *dname, dev_t dev, + unsigned long ino, unsigned invalidating) +{ + struct audit_watch *owatch, *nwatch, *nextw; + struct audit_krule *r, *nextr; + struct audit_entry *oentry, *nentry; + + mutex_lock(&audit_filter_mutex); + list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { + if (audit_compare_dname_path(dname, owatch->path, NULL)) + continue; + + /* If the update involves invalidating rules, do the inode-based + * filtering now, so we don't omit records. */ + if (invalidating && current->audit_context) + audit_filter_inodes(current, current->audit_context); + + nwatch = audit_dupe_watch(owatch); + if (IS_ERR(nwatch)) { + mutex_unlock(&audit_filter_mutex); + audit_panic("error updating watch, skipping"); + return; + } + nwatch->dev = dev; + nwatch->ino = ino; + + list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) { + + oentry = container_of(r, struct audit_entry, rule); + list_del(&oentry->rule.rlist); + list_del_rcu(&oentry->list); + + nentry = audit_dupe_rule(&oentry->rule, nwatch); + if (IS_ERR(nentry)) { + list_del(&oentry->rule.list); + audit_panic("error updating watch, removing"); + } else { + int h = audit_hash_ino((u32)ino); + list_add(&nentry->rule.rlist, &nwatch->rules); + list_add_rcu(&nentry->list, &audit_inode_hash[h]); + list_replace(&oentry->rule.list, + &nentry->rule.list); + } + + audit_watch_log_rule_change(r, owatch, "updated rules"); + + call_rcu(&oentry->rcu, audit_free_rule_rcu); + } + + audit_remove_watch(owatch); + goto add_watch_to_parent; /* event applies to a single watch */ + } + mutex_unlock(&audit_filter_mutex); + return; + +add_watch_to_parent: + list_add(&nwatch->wlist, &parent->watches); + mutex_unlock(&audit_filter_mutex); + return; +} + +/* Remove all watches & rules associated with a parent that is going away. 
*/ +static void audit_remove_parent_watches(struct audit_parent *parent) +{ + struct audit_watch *w, *nextw; + struct audit_krule *r, *nextr; + struct audit_entry *e; + + mutex_lock(&audit_filter_mutex); + parent->flags |= AUDIT_PARENT_INVALID; + list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { + list_for_each_entry_safe(r, nextr, &w->rules, rlist) { + e = container_of(r, struct audit_entry, rule); + audit_watch_log_rule_change(r, w, "remove rule"); + list_del(&r->rlist); + list_del(&r->list); + list_del_rcu(&e->list); + call_rcu(&e->rcu, audit_free_rule_rcu); + } + audit_remove_watch(w); + } + mutex_unlock(&audit_filter_mutex); +} + +/* Unregister inotify watches for parents on in_list. + * Generates an IN_IGNORED event. */ +void audit_inotify_unregister(struct list_head *in_list) +{ + struct audit_parent *p, *n; + + list_for_each_entry_safe(p, n, in_list, ilist) { + list_del(&p->ilist); + inotify_rm_watch(audit_ih, &p->wdata); + /* the unpin matching the pin in audit_do_del_rule() */ + unpin_inotify_watch(&p->wdata); + } +} + +/* Get path information necessary for adding watches. */ +int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw) +{ + struct nameidata *ndparent, *ndwatch; + int err; + + ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); + if (unlikely(!ndparent)) + return -ENOMEM; + + ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); + if (unlikely(!ndwatch)) { + kfree(ndparent); + return -ENOMEM; + } + + err = path_lookup(path, LOOKUP_PARENT, ndparent); + if (err) { + kfree(ndparent); + kfree(ndwatch); + return err; + } + + err = path_lookup(path, 0, ndwatch); + if (err) { + kfree(ndwatch); + ndwatch = NULL; + } + + *ndp = ndparent; + *ndw = ndwatch; + + return 0; +} + +/* Release resources used for watch path information. */ +void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) +{ + if (ndp) { + path_put(&ndp->path); + kfree(ndp); + } + if (ndw) { + path_put(&ndw->path); + kfree(ndw); + } +} + +/* Associate the given rule with an existing parent inotify_watch. + * Caller must hold audit_filter_mutex. */ +static void audit_add_to_parent(struct audit_krule *krule, + struct audit_parent *parent) +{ + struct audit_watch *w, *watch = krule->watch; + int watch_found = 0; + + list_for_each_entry(w, &parent->watches, wlist) { + if (strcmp(watch->path, w->path)) + continue; + + watch_found = 1; + + /* put krule's and initial refs to temporary watch */ + audit_put_watch(watch); + audit_put_watch(watch); + + audit_get_watch(w); + krule->watch = watch = w; + break; + } + + if (!watch_found) { + get_inotify_watch(&parent->wdata); + watch->parent = parent; + + list_add(&watch->wlist, &parent->watches); + } + list_add(&krule->rlist, &watch->rules); +} + +/* Find a matching watch entry, or add this one. + * Caller must hold audit_filter_mutex. */ +int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, + struct nameidata *ndw) +{ + struct audit_watch *watch = krule->watch; + struct inotify_watch *i_watch; + struct audit_parent *parent; + int ret = 0; + + /* update watch filter fields */ + if (ndw) { + watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; + watch->ino = ndw->path.dentry->d_inode->i_ino; + } + + /* The audit_filter_mutex must not be held during inotify calls because + * we hold it during inotify event callback processing. If an existing + * inotify watch is found, inotify_find_watch() grabs a reference before + * returning. 
+ */ + mutex_unlock(&audit_filter_mutex); + + if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode, + &i_watch) < 0) { + parent = audit_init_parent(ndp); + if (IS_ERR(parent)) { + /* caller expects mutex locked */ + mutex_lock(&audit_filter_mutex); + return PTR_ERR(parent); + } + } else + parent = container_of(i_watch, struct audit_parent, wdata); + + mutex_lock(&audit_filter_mutex); + + /* parent was moved before we took audit_filter_mutex */ + if (parent->flags & AUDIT_PARENT_INVALID) + ret = -ENOENT; + else + audit_add_to_parent(krule, parent); + + /* match get in audit_init_parent or inotify_find_watch */ + put_inotify_watch(&parent->wdata); + return ret; +} + +void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list) +{ + struct audit_watch *watch = krule->watch; + struct audit_parent *parent = watch->parent; + + list_del(&krule->rlist); + + if (list_empty(&watch->rules)) { + audit_remove_watch(watch); + + if (list_empty(&parent->watches)) { + /* Put parent on the inotify un-registration + * list. Grab a reference before releasing + * audit_filter_mutex, to be released in + * audit_inotify_unregister(). + * If filesystem is going away, just leave + * the sucker alone, eviction will take + * care of it. */ + if (pin_inotify_watch(&parent->wdata)) + list_add(&parent->ilist, list); + } + } +} + +/* Update watch data in audit rules based on inotify events. */ +static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask, + u32 cookie, const char *dname, struct inode *inode) +{ + struct audit_parent *parent; + + parent = container_of(i_watch, struct audit_parent, wdata); + + if (mask & (IN_CREATE|IN_MOVED_TO) && inode) + audit_update_watch(parent, dname, inode->i_sb->s_dev, + inode->i_ino, 0); + else if (mask & (IN_DELETE|IN_MOVED_FROM)) + audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); + /* inotify automatically removes the watch and sends IN_IGNORED */ + else if (mask & (IN_DELETE_SELF|IN_UNMOUNT)) + audit_remove_parent_watches(parent); + /* inotify does not remove the watch, so remove it manually */ + else if(mask & IN_MOVE_SELF) { + audit_remove_parent_watches(parent); + inotify_remove_watch_locked(audit_ih, i_watch); + } else if (mask & IN_IGNORED) + put_inotify_watch(i_watch); +} + +static const struct inotify_operations audit_inotify_ops = { + .handle_event = audit_handle_ievent, + .destroy_watch = audit_free_parent, +}; + +static int __init audit_watch_init(void) +{ + audit_ih = inotify_init(&audit_inotify_ops); + if (IS_ERR(audit_ih)) + audit_panic("cannot initialize inotify handle"); + return 0; +} +subsys_initcall(audit_watch_init); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index e7466dd145c9..9d4c93437de6 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include "audit.h" @@ -44,36 +43,6 @@ * be written directly provided audit_filter_mutex is held. */ -/* - * Reference counting: - * - * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED - * event. Each audit_watch holds a reference to its associated parent. - * - * audit_watch: if added to lists, lifetime is from audit_init_watch() to - * audit_remove_watch(). Additionally, an audit_watch may exist - * temporarily to assist in searching existing filter data. Each - * audit_krule holds a reference to its associated watch. 
- */ - -struct audit_parent { - struct list_head ilist; /* entry in inotify registration list */ - struct list_head watches; /* associated watches */ - struct inotify_watch wdata; /* inotify watch data */ - unsigned flags; /* status flags */ -}; - -/* - * audit_parent status flags: - * - * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to - * a filesystem event to ensure we're adding audit watches to a valid parent. - * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot - * receive them while we have nameidata, but must be used for IN_MOVE_SELF which - * we can receive while holding nameidata. - */ -#define AUDIT_PARENT_INVALID 0x001 - /* Audit filter lists, defined in */ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { LIST_HEAD_INIT(audit_filter_list[0]), @@ -97,41 +66,6 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = { DEFINE_MUTEX(audit_filter_mutex); -/* Inotify events we care about. */ -#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF - -void audit_free_parent(struct inotify_watch *i_watch) -{ - struct audit_parent *parent; - - parent = container_of(i_watch, struct audit_parent, wdata); - WARN_ON(!list_empty(&parent->watches)); - kfree(parent); -} - -static inline void audit_get_watch(struct audit_watch *watch) -{ - atomic_inc(&watch->count); -} - -static void audit_put_watch(struct audit_watch *watch) -{ - if (atomic_dec_and_test(&watch->count)) { - WARN_ON(watch->parent); - WARN_ON(!list_empty(&watch->rules)); - kfree(watch->path); - kfree(watch); - } -} - -static void audit_remove_watch(struct audit_watch *watch) -{ - list_del(&watch->wlist); - put_inotify_watch(&watch->parent->wdata); - watch->parent = NULL; - audit_put_watch(watch); /* match initial get */ -} - static inline void audit_free_rule(struct audit_entry *e) { int i; @@ -156,50 +90,6 @@ void audit_free_rule_rcu(struct rcu_head *head) audit_free_rule(e); } -/* Initialize a parent watch entry. */ -static struct audit_parent *audit_init_parent(struct nameidata *ndp) -{ - struct audit_parent *parent; - s32 wd; - - parent = kzalloc(sizeof(*parent), GFP_KERNEL); - if (unlikely(!parent)) - return ERR_PTR(-ENOMEM); - - INIT_LIST_HEAD(&parent->watches); - parent->flags = 0; - - inotify_init_watch(&parent->wdata); - /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ - get_inotify_watch(&parent->wdata); - wd = inotify_add_watch(audit_ih, &parent->wdata, - ndp->path.dentry->d_inode, AUDIT_IN_WATCH); - if (wd < 0) { - audit_free_parent(&parent->wdata); - return ERR_PTR(wd); - } - - return parent; -} - -/* Initialize a watch entry. */ -static struct audit_watch *audit_init_watch(char *path) -{ - struct audit_watch *watch; - - watch = kzalloc(sizeof(*watch), GFP_KERNEL); - if (unlikely(!watch)) - return ERR_PTR(-ENOMEM); - - INIT_LIST_HEAD(&watch->rules); - atomic_set(&watch->count, 1); - watch->path = path; - watch->dev = (dev_t)-1; - watch->ino = (unsigned long)-1; - - return watch; -} - /* Initialize an audit filterlist entry. */ static inline struct audit_entry *audit_init_entry(u32 field_count) { @@ -260,31 +150,6 @@ static inline int audit_to_inode(struct audit_krule *krule, return 0; } -/* Translate a watch string to kernel respresentation. 
*/ -static int audit_to_watch(struct audit_krule *krule, char *path, int len, - u32 op) -{ - struct audit_watch *watch; - - if (!audit_ih) - return -EOPNOTSUPP; - - if (path[0] != '/' || path[len-1] == '/' || - krule->listnr != AUDIT_FILTER_EXIT || - op != Audit_equal || - krule->inode_f || krule->watch || krule->tree) - return -EINVAL; - - watch = audit_init_watch(path); - if (IS_ERR(watch)) - return PTR_ERR(watch); - - audit_get_watch(watch); - krule->watch = watch; - - return 0; -} - static __u32 *classes[AUDIT_SYSCALL_CLASSES]; int __init audit_register_class(int class, unsigned *list) @@ -766,7 +631,8 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) break; case AUDIT_WATCH: data->buflen += data->values[i] = - audit_pack_string(&bufp, krule->watch->path); + audit_pack_string(&bufp, + audit_watch_path(krule->watch)); break; case AUDIT_DIR: data->buflen += data->values[i] = @@ -818,7 +684,8 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) return 1; break; case AUDIT_WATCH: - if (strcmp(a->watch->path, b->watch->path)) + if (strcmp(audit_watch_path(a->watch), + audit_watch_path(b->watch))) return 1; break; case AUDIT_DIR: @@ -844,32 +711,6 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) return 0; } -/* Duplicate the given audit watch. The new watch's rules list is initialized - * to an empty list and wlist is undefined. */ -static struct audit_watch *audit_dupe_watch(struct audit_watch *old) -{ - char *path; - struct audit_watch *new; - - path = kstrdup(old->path, GFP_KERNEL); - if (unlikely(!path)) - return ERR_PTR(-ENOMEM); - - new = audit_init_watch(path); - if (IS_ERR(new)) { - kfree(path); - goto out; - } - - new->dev = old->dev; - new->ino = old->ino; - get_inotify_watch(&old->parent->wdata); - new->parent = old->parent; - -out: - return new; -} - /* Duplicate LSM field information. The lsm_rule is opaque, so must be * re-initialized. */ static inline int audit_dupe_lsm_field(struct audit_field *df, @@ -904,8 +745,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df, * rule with the new rule in the filterlist, then free the old rule. * The rlist element is undefined; list manipulations are handled apart from * the initial copy. */ -static struct audit_entry *audit_dupe_rule(struct audit_krule *old, - struct audit_watch *watch) +struct audit_entry *audit_dupe_rule(struct audit_krule *old, + struct audit_watch *watch) { u32 fcount = old->field_count; struct audit_entry *entry; @@ -977,127 +818,6 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old, return entry; } -static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op) -{ - if (audit_enabled) { - struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "auid=%u ses=%u op=", - audit_get_loginuid(current), - audit_get_sessionid(current)); - audit_log_string(ab, op); - audit_log_format(ab, " path="); - audit_log_untrustedstring(ab, w->path); - if (r->filterkey) { - audit_log_format(ab, " key="); - audit_log_untrustedstring(ab, r->filterkey); - } else - audit_log_format(ab, " key=(null)"); - audit_log_format(ab, " list=%d res=1", r->listnr); - audit_log_end(ab); - } -} - -/* Update inode info in audit rules based on filesystem event. 
*/ -static void audit_update_watch(struct audit_parent *parent, - const char *dname, dev_t dev, - unsigned long ino, unsigned invalidating) -{ - struct audit_watch *owatch, *nwatch, *nextw; - struct audit_krule *r, *nextr; - struct audit_entry *oentry, *nentry; - - mutex_lock(&audit_filter_mutex); - list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { - if (audit_compare_dname_path(dname, owatch->path, NULL)) - continue; - - /* If the update involves invalidating rules, do the inode-based - * filtering now, so we don't omit records. */ - if (invalidating && current->audit_context) - audit_filter_inodes(current, current->audit_context); - - nwatch = audit_dupe_watch(owatch); - if (IS_ERR(nwatch)) { - mutex_unlock(&audit_filter_mutex); - audit_panic("error updating watch, skipping"); - return; - } - nwatch->dev = dev; - nwatch->ino = ino; - - list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) { - - oentry = container_of(r, struct audit_entry, rule); - list_del(&oentry->rule.rlist); - list_del_rcu(&oentry->list); - - nentry = audit_dupe_rule(&oentry->rule, nwatch); - if (IS_ERR(nentry)) { - list_del(&oentry->rule.list); - audit_panic("error updating watch, removing"); - } else { - int h = audit_hash_ino((u32)ino); - list_add(&nentry->rule.rlist, &nwatch->rules); - list_add_rcu(&nentry->list, &audit_inode_hash[h]); - list_replace(&oentry->rule.list, - &nentry->rule.list); - } - - audit_watch_log_rule_change(r, owatch, "updated rules"); - - call_rcu(&oentry->rcu, audit_free_rule_rcu); - } - - audit_remove_watch(owatch); - goto add_watch_to_parent; /* event applies to a single watch */ - } - mutex_unlock(&audit_filter_mutex); - return; - -add_watch_to_parent: - list_add(&nwatch->wlist, &parent->watches); - mutex_unlock(&audit_filter_mutex); - return; -} - -/* Remove all watches & rules associated with a parent that is going away. */ -static void audit_remove_parent_watches(struct audit_parent *parent) -{ - struct audit_watch *w, *nextw; - struct audit_krule *r, *nextr; - struct audit_entry *e; - - mutex_lock(&audit_filter_mutex); - parent->flags |= AUDIT_PARENT_INVALID; - list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { - list_for_each_entry_safe(r, nextr, &w->rules, rlist) { - e = container_of(r, struct audit_entry, rule); - audit_watch_log_rule_change(r, w, "remove rule"); - list_del(&r->rlist); - list_del(&r->list); - list_del_rcu(&e->list); - call_rcu(&e->rcu, audit_free_rule_rcu); - } - audit_remove_watch(w); - } - mutex_unlock(&audit_filter_mutex); -} - -/* Unregister inotify watches for parents on in_list. - * Generates an IN_IGNORED event. */ -static void audit_inotify_unregister(struct list_head *in_list) -{ - struct audit_parent *p, *n; - - list_for_each_entry_safe(p, n, in_list, ilist) { - list_del(&p->ilist); - inotify_rm_watch(audit_ih, &p->wdata); - /* the unpin matching the pin in audit_do_del_rule() */ - unpin_inotify_watch(&p->wdata); - } -} - /* Find an existing audit rule. * Caller must hold audit_filter_mutex to prevent stale rule data. */ static struct audit_entry *audit_find_rule(struct audit_entry *entry, @@ -1135,134 +855,6 @@ out: return found; } -/* Get path information necessary for adding watches. 
*/ -static int audit_get_nd(char *path, struct nameidata **ndp, - struct nameidata **ndw) -{ - struct nameidata *ndparent, *ndwatch; - int err; - - ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); - if (unlikely(!ndparent)) - return -ENOMEM; - - ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); - if (unlikely(!ndwatch)) { - kfree(ndparent); - return -ENOMEM; - } - - err = path_lookup(path, LOOKUP_PARENT, ndparent); - if (err) { - kfree(ndparent); - kfree(ndwatch); - return err; - } - - err = path_lookup(path, 0, ndwatch); - if (err) { - kfree(ndwatch); - ndwatch = NULL; - } - - *ndp = ndparent; - *ndw = ndwatch; - - return 0; -} - -/* Release resources used for watch path information. */ -static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) -{ - if (ndp) { - path_put(&ndp->path); - kfree(ndp); - } - if (ndw) { - path_put(&ndw->path); - kfree(ndw); - } -} - -/* Associate the given rule with an existing parent inotify_watch. - * Caller must hold audit_filter_mutex. */ -static void audit_add_to_parent(struct audit_krule *krule, - struct audit_parent *parent) -{ - struct audit_watch *w, *watch = krule->watch; - int watch_found = 0; - - list_for_each_entry(w, &parent->watches, wlist) { - if (strcmp(watch->path, w->path)) - continue; - - watch_found = 1; - - /* put krule's and initial refs to temporary watch */ - audit_put_watch(watch); - audit_put_watch(watch); - - audit_get_watch(w); - krule->watch = watch = w; - break; - } - - if (!watch_found) { - get_inotify_watch(&parent->wdata); - watch->parent = parent; - - list_add(&watch->wlist, &parent->watches); - } - list_add(&krule->rlist, &watch->rules); -} - -/* Find a matching watch entry, or add this one. - * Caller must hold audit_filter_mutex. */ -static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, - struct nameidata *ndw) -{ - struct audit_watch *watch = krule->watch; - struct inotify_watch *i_watch; - struct audit_parent *parent; - int ret = 0; - - /* update watch filter fields */ - if (ndw) { - watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; - watch->ino = ndw->path.dentry->d_inode->i_ino; - } - - /* The audit_filter_mutex must not be held during inotify calls because - * we hold it during inotify event callback processing. If an existing - * inotify watch is found, inotify_find_watch() grabs a reference before - * returning. - */ - mutex_unlock(&audit_filter_mutex); - - if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode, - &i_watch) < 0) { - parent = audit_init_parent(ndp); - if (IS_ERR(parent)) { - /* caller expects mutex locked */ - mutex_lock(&audit_filter_mutex); - return PTR_ERR(parent); - } - } else - parent = container_of(i_watch, struct audit_parent, wdata); - - mutex_lock(&audit_filter_mutex); - - /* parent was moved before we took audit_filter_mutex */ - if (parent->flags & AUDIT_PARENT_INVALID) - ret = -ENOENT; - else - audit_add_to_parent(krule, parent); - - /* match get in audit_init_parent or inotify_find_watch */ - put_inotify_watch(&parent->wdata); - return ret; -} - static u64 prio_low = ~0ULL/2; static u64 prio_high = ~0ULL/2 - 1; @@ -1297,7 +889,7 @@ static inline int audit_add_rule(struct audit_entry *entry) /* Avoid calling path_lookup under audit_filter_mutex. 
*/ if (watch) { - err = audit_get_nd(watch->path, &ndp, &ndw); + err = audit_get_nd(audit_watch_path(watch), &ndp, &ndw); if (err) goto error; } @@ -1312,7 +904,7 @@ static inline int audit_add_rule(struct audit_entry *entry) } /* entry->rule.watch may have changed during audit_add_watch() */ watch = entry->rule.watch; - h = audit_hash_ino((u32)watch->ino); + h = audit_hash_ino((u32)audit_watch_inode(watch)); list = &audit_inode_hash[h]; } if (tree) { @@ -1364,7 +956,7 @@ error: static inline int audit_del_rule(struct audit_entry *entry) { struct audit_entry *e; - struct audit_watch *watch, *tmp_watch = entry->rule.watch; + struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; struct list_head *list; LIST_HEAD(inotify_list); @@ -1386,29 +978,8 @@ static inline int audit_del_rule(struct audit_entry *entry) goto out; } - watch = e->rule.watch; - if (watch) { - struct audit_parent *parent = watch->parent; - - list_del(&e->rule.rlist); - - if (list_empty(&watch->rules)) { - audit_remove_watch(watch); - - if (list_empty(&parent->watches)) { - /* Put parent on the inotify un-registration - * list. Grab a reference before releasing - * audit_filter_mutex, to be released in - * audit_inotify_unregister(). - * If filesystem is going away, just leave - * the sucker alone, eviction will take - * care of it. - */ - if (pin_inotify_watch(&parent->wdata)) - list_add(&parent->ilist, &inotify_list); - } - } - } + if (e->rule.watch) + audit_remove_watch_rule(&e->rule, &inotify_list); if (e->rule.tree) audit_remove_tree_rule(&e->rule); @@ -1430,8 +1001,8 @@ static inline int audit_del_rule(struct audit_entry *entry) audit_inotify_unregister(&inotify_list); out: - if (tmp_watch) - audit_put_watch(tmp_watch); /* match initial get */ + if (watch) + audit_put_watch(watch); /* match initial get */ if (tree) audit_put_tree(tree); /* that's the temporary one */ @@ -1785,7 +1356,7 @@ static int update_lsm_rule(struct audit_krule *r) list_del(&r->list); } else { if (watch) { - list_add(&nentry->rule.rlist, &watch->rules); + list_add(&nentry->rule.rlist, audit_watch_rules(watch)); list_del(&r->rlist); } else if (tree) list_replace_init(&r->rlist, &nentry->rule.rlist); @@ -1821,27 +1392,3 @@ int audit_update_lsm_rules(void) return err; } - -/* Update watch data in audit rules based on inotify events. 
*/ -void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask, - u32 cookie, const char *dname, struct inode *inode) -{ - struct audit_parent *parent; - - parent = container_of(i_watch, struct audit_parent, wdata); - - if (mask & (IN_CREATE|IN_MOVED_TO) && inode) - audit_update_watch(parent, dname, inode->i_sb->s_dev, - inode->i_ino, 0); - else if (mask & (IN_DELETE|IN_MOVED_FROM)) - audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); - /* inotify automatically removes the watch and sends IN_IGNORED */ - else if (mask & (IN_DELETE_SELF|IN_UNMOUNT)) - audit_remove_parent_watches(parent); - /* inotify does not remove the watch, so remove it manually */ - else if(mask & IN_MOVE_SELF) { - audit_remove_parent_watches(parent); - inotify_remove_watch_locked(audit_ih, i_watch); - } else if (mask & IN_IGNORED) - put_inotify_watch(i_watch); -} diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b14d234b85f3..0b862cac6ca2 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -548,9 +548,9 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_WATCH: - if (name && rule->watch->ino != (unsigned long)-1) - result = (name->dev == rule->watch->dev && - name->ino == rule->watch->ino); + if (name && audit_watch_inode(rule->watch) != (unsigned long)-1) + result = (name->dev == audit_watch_dev(rule->watch) && + name->ino == audit_watch_inode(rule->watch)); break; case AUDIT_DIR: if (ctx) -- cgit v1.2.3 From 35fe4d0b1b12286a81938e9c5fdfaf639ac0ce5b Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 11 Jun 2009 14:31:36 -0400 Subject: Audit: move audit_get_nd completely into audit_watch audit_get_nd() is only used by audit_watch and could be more cleanly implemented by having the audit watch functions call it when needed rather than making the generic audit rule parsing code deal with those objects. Signed-off-by: Eric Paris --- kernel/audit.h | 5 +---- kernel/audit_watch.c | 27 ++++++++++++++++++++------- kernel/auditfilter.c | 15 ++------------- 3 files changed, 23 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index 704d5b01d9fd..bb1c0d69db08 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -109,10 +109,7 @@ extern dev_t audit_watch_dev(struct audit_watch *watch); extern void audit_put_watch(struct audit_watch *watch); extern void audit_get_watch(struct audit_watch *watch); extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op); -extern int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw); -extern void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw); -extern int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, - struct nameidata *ndw); +extern int audit_add_watch(struct audit_krule *krule); extern void audit_remove_watch(struct audit_watch *watch); extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list); extern void audit_inotify_unregister(struct list_head *in_list); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index da8be6d39c1a..b49ab019fdff 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -345,7 +345,7 @@ void audit_inotify_unregister(struct list_head *in_list) } /* Get path information necessary for adding watches. 
*/ -int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw) +static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw) { struct nameidata *ndparent, *ndwatch; int err; @@ -380,7 +380,7 @@ int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw) } /* Release resources used for watch path information. */ -void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) +static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) { if (ndp) { path_put(&ndp->path); @@ -426,14 +426,24 @@ static void audit_add_to_parent(struct audit_krule *krule, /* Find a matching watch entry, or add this one. * Caller must hold audit_filter_mutex. */ -int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, - struct nameidata *ndw) +int audit_add_watch(struct audit_krule *krule) { struct audit_watch *watch = krule->watch; struct inotify_watch *i_watch; struct audit_parent *parent; + struct nameidata *ndp = NULL, *ndw = NULL; int ret = 0; + mutex_unlock(&audit_filter_mutex); + + /* Avoid calling path_lookup under audit_filter_mutex. */ + ret = audit_get_nd(watch->path, &ndp, &ndw); + if (ret) { + /* caller expects mutex locked */ + mutex_lock(&audit_filter_mutex); + goto error; + } + /* update watch filter fields */ if (ndw) { watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; @@ -445,15 +455,14 @@ int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, * inotify watch is found, inotify_find_watch() grabs a reference before * returning. */ - mutex_unlock(&audit_filter_mutex); - if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode, &i_watch) < 0) { parent = audit_init_parent(ndp); if (IS_ERR(parent)) { /* caller expects mutex locked */ mutex_lock(&audit_filter_mutex); - return PTR_ERR(parent); + ret = PTR_ERR(parent); + goto error; } } else parent = container_of(i_watch, struct audit_parent, wdata); @@ -468,7 +477,11 @@ int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, /* match get in audit_init_parent or inotify_find_watch */ put_inotify_watch(&parent->wdata); + +error: + audit_put_nd(ndp, ndw); /* NULL args OK */ return ret; + } void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 9d4c93437de6..21b623595aad 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -864,7 +864,6 @@ static inline int audit_add_rule(struct audit_entry *entry) struct audit_entry *e; struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; - struct nameidata *ndp = NULL, *ndw = NULL; struct list_head *list; int h, err; #ifdef CONFIG_AUDITSYSCALL @@ -878,8 +877,8 @@ static inline int audit_add_rule(struct audit_entry *entry) mutex_lock(&audit_filter_mutex); e = audit_find_rule(entry, &list); - mutex_unlock(&audit_filter_mutex); if (e) { + mutex_unlock(&audit_filter_mutex); err = -EEXIST; /* normally audit_add_tree_rule() will free it on failure */ if (tree) @@ -887,17 +886,9 @@ static inline int audit_add_rule(struct audit_entry *entry) goto error; } - /* Avoid calling path_lookup under audit_filter_mutex. 
*/ - if (watch) { - err = audit_get_nd(audit_watch_path(watch), &ndp, &ndw); - if (err) - goto error; - } - - mutex_lock(&audit_filter_mutex); if (watch) { /* audit_filter_mutex is dropped and re-taken during this call */ - err = audit_add_watch(&entry->rule, ndp, ndw); + err = audit_add_watch(&entry->rule); if (err) { mutex_unlock(&audit_filter_mutex); goto error; @@ -942,11 +933,9 @@ static inline int audit_add_rule(struct audit_entry *entry) #endif mutex_unlock(&audit_filter_mutex); - audit_put_nd(ndp, ndw); /* NULL args OK */ return 0; error: - audit_put_nd(ndp, ndw); /* NULL args OK */ if (watch) audit_put_watch(watch); /* tmp watch, matches initial get */ return err; -- cgit v1.2.3 From 9d9609851003ebed15957f0f2ce18492739ee124 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 11 Jun 2009 14:31:37 -0400 Subject: Audit: clean up all op= output to include string quoting A number of places in the audit system we send an op= followed by a string that includes spaces. Somehow this works but it's just wrong. This patch moves all of those that I could find to be quoted. Example: Change From: type=CONFIG_CHANGE msg=audit(1244666690.117:31): auid=0 ses=1 subj=unconfined_u:unconfined_r:auditctl_t:s0-s0:c0.c1023 op=remove rule key="number2" list=4 res=0 Change To: type=CONFIG_CHANGE msg=audit(1244666690.117:31): auid=0 ses=1 subj=unconfined_u:unconfined_r:auditctl_t:s0-s0:c0.c1023 op="remove rule" key="number2" list=4 res=0 Signed-off-by: Eric Paris --- include/linux/audit.h | 3 +++ kernel/audit.c | 9 +++++++++ kernel/audit_tree.c | 10 ++++------ kernel/audit_watch.c | 6 +----- kernel/auditfilter.c | 12 +++++------- kernel/auditsc.c | 8 ++------ 6 files changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 4fa2810b675e..3c7a358241a7 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -599,6 +599,8 @@ extern void audit_log_untrustedstring(struct audit_buffer *ab, extern void audit_log_d_path(struct audit_buffer *ab, const char *prefix, struct path *path); +extern void audit_log_key(struct audit_buffer *ab, + char *key); extern void audit_log_lost(const char *message); extern int audit_update_lsm_rules(void); @@ -621,6 +623,7 @@ extern int audit_enabled; #define audit_log_n_untrustedstring(a,n,s) do { ; } while (0) #define audit_log_untrustedstring(a,s) do { ; } while (0) #define audit_log_d_path(b, p, d) do { ; } while (0) +#define audit_log_key(b, k) do { ; } while (0) #define audit_enabled 0 #endif #endif diff --git a/kernel/audit.c b/kernel/audit.c index e07ad2340dbe..6194c50e2039 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1450,6 +1450,15 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, kfree(pathname); } +void audit_log_key(struct audit_buffer *ab, char *key) +{ + audit_log_format(ab, " key="); + if (key) + audit_log_untrustedstring(ab, key); + else + audit_log_format(ab, "(null)"); +} + /** * audit_log_end - end one audit record * @ab: the audit_buffer diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 1f6396d76687..3ff0731284a1 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -441,13 +441,11 @@ static void kill_rules(struct audit_tree *tree) if (rule->tree) { /* not a half-baked one */ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "op=remove rule dir="); + audit_log_format(ab, "op="); + audit_log_string(ab, "remove rule"); + audit_log_format(ab, " dir="); audit_log_untrustedstring(ab, rule->tree->pathname); - 
if (rule->filterkey) { - audit_log_format(ab, " key="); - audit_log_untrustedstring(ab, rule->filterkey); - } else - audit_log_format(ab, " key=(null)"); + audit_log_key(ab, rule->filterkey); audit_log_format(ab, " list=%d res=1", rule->listnr); audit_log_end(ab); rule->tree = NULL; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index b49ab019fdff..0e96dbc60ea9 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -234,11 +234,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc audit_log_string(ab, op); audit_log_format(ab, " path="); audit_log_untrustedstring(ab, w->path); - if (r->filterkey) { - audit_log_format(ab, " key="); - audit_log_untrustedstring(ab, r->filterkey); - } else - audit_log_format(ab, " key=(null)"); + audit_log_key(ab, r->filterkey); audit_log_format(ab, " list=%d res=1", r->listnr); audit_log_end(ab); } diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 21b623595aad..a70604047f3c 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1079,11 +1079,9 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, security_release_secctx(ctx, len); } } - audit_log_format(ab, " op=%s rule key=", action); - if (rule->filterkey) - audit_log_untrustedstring(ab, rule->filterkey); - else - audit_log_format(ab, "(null)"); + audit_log_format(ab, " op="); + audit_log_string(ab, action); + audit_log_key(ab, rule->filterkey); audit_log_format(ab, " list=%d res=%d", rule->listnr, res); audit_log_end(ab); } @@ -1147,7 +1145,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, return PTR_ERR(entry); err = audit_add_rule(entry); - audit_log_rule_change(loginuid, sessionid, sid, "add", + audit_log_rule_change(loginuid, sessionid, sid, "add rule", &entry->rule, !err); if (err) @@ -1163,7 +1161,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, return PTR_ERR(entry); err = audit_del_rule(entry); - audit_log_rule_change(loginuid, sessionid, sid, "remove", + audit_log_rule_change(loginuid, sessionid, sid, "remove rule", &entry->rule, !err); audit_free_rule(entry); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 0b862cac6ca2..2de95d1582bc 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1137,7 +1137,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, if (has_cntl) audit_log_n_hex(*ab, buf, to_send); else - audit_log_format(*ab, "\"%s\"", buf); + audit_log_string(*ab, buf); p += to_send; len_left -= to_send; @@ -1372,11 +1372,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_task_info(ab, tsk); - if (context->filterkey) { - audit_log_format(ab, " key="); - audit_log_untrustedstring(ab, context->filterkey); - } else - audit_log_format(ab, " key=(null)"); + audit_log_key(ab, context->filterkey); audit_log_end(ab); for (aux = context->aux; aux; aux = aux->next) { -- cgit v1.2.3 From 916d75761c971b6e630a26bd4ba472e90ac9a4b9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 24 Jun 2009 00:02:38 -0400 Subject: Fix rule eviction order for AUDIT_DIR If syscall removes the root of subtree being watched, we definitely do not want the rules refering that subtree to be destroyed without the syscall in question having a chance to match them. 
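The hunks that follow implement this by parking each doomed tree on a per-task list and reaping it only after the syscall has been audited. A condensed sketch of the consumer side, using the helpers this patch introduces (the wrapper function name is made up for illustration):

	/* called from syscall/task exit, after the rules had their chance to match */
	static void demo_reap_killed_trees(struct audit_context *context)
	{
		if (!list_empty(&context->killed_trees))
			audit_kill_trees(&context->killed_trees);
	}

Deferring the teardown this way keeps the rules alive just long enough for the offending syscall's own audit record to be generated against them.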
Signed-off-by: Al Viro --- kernel/audit.c | 17 +--------------- kernel/audit.h | 7 +++++-- kernel/audit_tree.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++----- kernel/auditsc.c | 15 ++++++++++++++ 4 files changed, 72 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 6194c50e2039..defc2e6f1e3b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -133,7 +133,7 @@ static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); /* Serialize requests from userspace. */ -static DEFINE_MUTEX(audit_cmd_mutex); +DEFINE_MUTEX(audit_cmd_mutex); /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting * audit records. Since printk uses a 1024 byte buffer, this buffer @@ -505,21 +505,6 @@ int audit_send_list(void *_dest) return 0; } -#ifdef CONFIG_AUDIT_TREE -static int prune_tree_thread(void *unused) -{ - mutex_lock(&audit_cmd_mutex); - audit_prune_trees(); - mutex_unlock(&audit_cmd_mutex); - return 0; -} - -void audit_schedule_prune(void) -{ - kthread_run(prune_tree_thread, NULL, "audit_prune_tree"); -} -#endif - struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, int multi, void *payload, int size) { diff --git a/kernel/audit.h b/kernel/audit.h index bb1c0d69db08..208687be4f30 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -128,10 +128,9 @@ extern int audit_add_tree_rule(struct audit_krule *); extern int audit_remove_tree_rule(struct audit_krule *); extern void audit_trim_trees(void); extern int audit_tag_tree(char *old, char *new); -extern void audit_schedule_prune(void); -extern void audit_prune_trees(void); extern const char *audit_tree_path(struct audit_tree *); extern void audit_put_tree(struct audit_tree *); +extern void audit_kill_trees(struct list_head *); #else #define audit_remove_tree_rule(rule) BUG() #define audit_add_tree_rule(rule) -EINVAL @@ -140,6 +139,7 @@ extern void audit_put_tree(struct audit_tree *); #define audit_put_tree(tree) (void)0 #define audit_tag_tree(old, new) -EINVAL #define audit_tree_path(rule) "" /* never called */ +#define audit_kill_trees(list) BUG() #endif extern char *audit_unpack_string(void **, size_t *, size_t); @@ -158,7 +158,10 @@ static inline int audit_signal_info(int sig, struct task_struct *t) return 0; } extern void audit_filter_inodes(struct task_struct *, struct audit_context *); +extern struct list_head *audit_killed_trees(void); #else #define audit_signal_info(s,t) AUDIT_DISABLED #define audit_filter_inodes(t,c) AUDIT_DISABLED #endif + +extern struct mutex audit_cmd_mutex; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 3ff0731284a1..2451dc6f3282 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -2,6 +2,7 @@ #include #include #include +#include struct audit_tree; struct audit_chunk; @@ -517,6 +518,8 @@ static void trim_marked(struct audit_tree *tree) } } +static void audit_schedule_prune(void); + /* called with audit_filter_mutex */ int audit_remove_tree_rule(struct audit_krule *rule) { @@ -822,10 +825,11 @@ int audit_tag_tree(char *old, char *new) /* * That gets run when evict_chunk() ends up needing to kill audit_tree. - * Runs from a separate thread, with audit_cmd_mutex held. + * Runs from a separate thread. 
*/ -void audit_prune_trees(void) +static int prune_tree_thread(void *unused) { + mutex_lock(&audit_cmd_mutex); mutex_lock(&audit_filter_mutex); while (!list_empty(&prune_list)) { @@ -842,6 +846,40 @@ void audit_prune_trees(void) } mutex_unlock(&audit_filter_mutex); + mutex_unlock(&audit_cmd_mutex); + return 0; +} + +static void audit_schedule_prune(void) +{ + kthread_run(prune_tree_thread, NULL, "audit_prune_tree"); +} + +/* + * ... and that one is done if evict_chunk() decides to delay until the end + * of syscall. Runs synchronously. + */ +void audit_kill_trees(struct list_head *list) +{ + mutex_lock(&audit_cmd_mutex); + mutex_lock(&audit_filter_mutex); + + while (!list_empty(list)) { + struct audit_tree *victim; + + victim = list_entry(list->next, struct audit_tree, list); + kill_rules(victim); + list_del_init(&victim->list); + + mutex_unlock(&audit_filter_mutex); + + prune_one(victim); + + mutex_lock(&audit_filter_mutex); + } + + mutex_unlock(&audit_filter_mutex); + mutex_unlock(&audit_cmd_mutex); } /* @@ -852,6 +890,8 @@ void audit_prune_trees(void) static void evict_chunk(struct audit_chunk *chunk) { struct audit_tree *owner; + struct list_head *postponed = audit_killed_trees(); + int need_prune = 0; int n; if (chunk->dead) @@ -867,15 +907,21 @@ static void evict_chunk(struct audit_chunk *chunk) owner->root = NULL; list_del_init(&owner->same_root); spin_unlock(&hash_lock); - kill_rules(owner); - list_move(&owner->list, &prune_list); - audit_schedule_prune(); + if (!postponed) { + kill_rules(owner); + list_move(&owner->list, &prune_list); + need_prune = 1; + } else { + list_move(&owner->list, postponed); + } spin_lock(&hash_lock); } list_del_rcu(&chunk->hash); for (n = 0; n < chunk->count; n++) list_del_init(&chunk->owners[n].list); spin_unlock(&hash_lock); + if (need_prune) + audit_schedule_prune(); mutex_unlock(&audit_filter_mutex); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 2de95d1582bc..68d3c6a0ecd6 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -199,6 +199,7 @@ struct audit_context { struct audit_tree_refs *trees, *first_trees; int tree_count; + struct list_head killed_trees; int type; union { @@ -853,6 +854,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state) if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) return NULL; audit_zero_context(context, state); + INIT_LIST_HEAD(&context->killed_trees); return context; } @@ -1545,6 +1547,8 @@ void audit_free(struct task_struct *tsk) /* that can happen only if we are called from do_exit() */ if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) audit_log_exit(context, tsk); + if (!list_empty(&context->killed_trees)) + audit_kill_trees(&context->killed_trees); audit_free_context(context); } @@ -1688,6 +1692,9 @@ void audit_syscall_exit(int valid, long return_code) context->in_syscall = 0; context->prio = context->state == AUDIT_RECORD_CONTEXT ? 
~0ULL : 0; + if (!list_empty(&context->killed_trees)) + audit_kill_trees(&context->killed_trees); + if (context->previous) { struct audit_context *new_context = context->previous; context->previous = NULL; @@ -2521,3 +2528,11 @@ void audit_core_dumps(long signr) audit_log_format(ab, " sig=%ld", signr); audit_log_end(ab); } + +struct list_head *audit_killed_trees(void) +{ + struct audit_context *ctx = current->audit_context; + if (likely(!ctx || !ctx->in_syscall)) + return NULL; + return &ctx->killed_trees; +} -- cgit v1.2.3 From e1c7e2a6e67fe9db19dd15e71614526a31b5fdb1 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Jun 2009 09:52:29 +0800 Subject: tracing/events: Don't increment @pos in s_start() While testing syscall tracepoints posted by Jason, I found 3 entries were missing when reading available_events. The output size of available_events is < 4 pages, which means we lost 1 entry per page. The cause is, it's wrong to increment @pos in s_start(). Actually there's another bug here -- reading avaiable_events/set_events can race with module unload: # cat available_events | s_start() | s_stop() | | # rmmod foo.ko s_start() | call = list_entry(m->private) | @call might be freed and accessing it will lead to crash. Reviewed-by: Liming Wang Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A4186DD.6090405@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index aa08be69a1b6..53c8fd376a88 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -300,10 +300,18 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { + struct ftrace_event_call *call = NULL; + loff_t l; + mutex_lock(&event_mutex); - if (*pos == 0) - m->private = ftrace_events.next; - return t_next(m, NULL, pos); + + m->private = ftrace_events.next; + for (l = 0; l <= *pos; ) { + call = t_next(m, NULL, &l); + if (!call) + break; + } + return call; } static void * @@ -332,10 +340,18 @@ s_next(struct seq_file *m, void *v, loff_t *pos) static void *s_start(struct seq_file *m, loff_t *pos) { + struct ftrace_event_call *call = NULL; + loff_t l; + mutex_lock(&event_mutex); - if (*pos == 0) - m->private = ftrace_events.next; - return s_next(m, NULL, pos); + + m->private = ftrace_events.next; + for (l = 0; l <= *pos; ) { + call = s_next(m, NULL, &l); + if (!call) + break; + } + return call; } static int t_show(struct seq_file *m, void *v) -- cgit v1.2.3 From c8961ec6da22ea010bf4470a8e0fb3fdad0f11c4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Jun 2009 09:52:58 +0800 Subject: tracing_bprintk: Don't increment @pos in t_start() It's wrong to increment @pos in t_start(), otherwise we'll lose some entries when reading printk_formats, if the output is larger than PAGE_SIZE. 
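The contract being violated is the seq_file one: ->start() must position the cursor at *pos without advancing it, and only ->next() may increment *pos, because ->start() is re-entered with the saved position whenever the output does not fit in one buffer. A minimal sketch of that contract over a hypothetical static table (not taken from any of these patches):

	#include <linux/kernel.h>
	#include <linux/seq_file.h>

	static const char *demo_items[] = { "alpha", "beta", "gamma" };

	static void *demo_start(struct seq_file *m, loff_t *pos)
	{
		/* position at *pos, do not advance it here */
		if (*pos >= ARRAY_SIZE(demo_items))
			return NULL;
		return (void *)&demo_items[*pos];
	}

	static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
	{
		(*pos)++;			/* the only place *pos moves */
		return demo_start(m, pos);
	}

	static void demo_stop(struct seq_file *m, void *v) { }

	static int demo_show(struct seq_file *m, void *v)
	{
		seq_printf(m, "%s\n", *(const char **)v);
		return 0;
	}

	static const struct seq_operations demo_seq_ops = {
		.start = demo_start,
		.next  = demo_next,
		.stop  = demo_stop,
		.show  = demo_show,
	};

The t_start()/t_next() pair in the diff below has the same shape: start() derives the element purely from *pos, next() bumps *pos and delegates to start().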
Reported-by: Lai Jiangshan Reviewed-by: Liming Wang Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A4186FA.1020106@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_printk.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 9bece9687b62..7b6278110827 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -155,25 +155,19 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) EXPORT_SYMBOL_GPL(__ftrace_vprintk); static void * -t_next(struct seq_file *m, void *v, loff_t *pos) +t_start(struct seq_file *m, loff_t *pos) { - const char **fmt = m->private; - const char **next = fmt; - - (*pos)++; + const char **fmt = __start___trace_bprintk_fmt + *pos; if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt) return NULL; - - next = fmt; - m->private = ++next; - return fmt; } -static void *t_start(struct seq_file *m, loff_t *pos) +static void *t_next(struct seq_file *m, void * v, loff_t *pos) { - return t_next(m, NULL, pos); + (*pos)++; + return t_start(m, pos); } static int t_show(struct seq_file *m, void *v) @@ -224,15 +218,7 @@ static const struct seq_operations show_format_seq_ops = { static int ftrace_formats_open(struct inode *inode, struct file *file) { - int ret; - - ret = seq_open(file, &show_format_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - - m->private = __start___trace_bprintk_fmt; - } - return ret; + return seq_open(file, &show_format_seq_ops); } static const struct file_operations ftrace_formats_fops = { -- cgit v1.2.3 From 2961bf345fd1b736c3db46cad0f69855f67fbe9c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Jun 2009 09:53:26 +0800 Subject: trace_stat: Don't increment @pos in seq start() It's wrong to increment @pos in stat_seq_start(). It causes some stat entries lost when reading stat file, if the output of the file is larger than PAGE_SIZE. Reviewed-by: Liming Wang Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A418716.90209@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_stat.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index c00643733f4c..e66f5e493342 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -199,17 +199,13 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos) mutex_lock(&session->stat_mutex); /* If we are in the beginning of the file, print the headers */ - if (!*pos && session->ts->stat_headers) { - (*pos)++; + if (!*pos && session->ts->stat_headers) return SEQ_START_TOKEN; - } node = rb_first(&session->stat_root); for (i = 0; node && i < *pos; i++) node = rb_next(node); - (*pos)++; - return node; } -- cgit v1.2.3 From f129e965bef40c6153e4fe505f1e408286213424 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Jun 2009 09:53:44 +0800 Subject: tracing: Reset iterator in t_start() The iterator is m->private, but it's not reset to trace_types in t_start(). If the output is larger than PAGE_SIZE and t_start() is called the 2nd time, things will go wrong. 
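What makes this class of bug visible: seq_file hands out roughly one page of output per cycle, so once a file's output exceeds PAGE_SIZE (the "lost 1 entry per page" symptom noted above) ->start() runs again with the position reached so far, and a start() that trusts stale cached state or bumps *pos itself will drop or repeat entries. A userspace read loop that exercises one of the affected files; the debugfs path is an assumption about where debugfs is mounted:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[4096];
		ssize_t n;
		int fd = open("/sys/kernel/debug/tracing/available_events", O_RDONLY);

		if (fd < 0)
			return 1;
		while ((n = read(fd, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, n, stdout);	/* compare entry counts before/after the fixes */
		close(fd);
		return 0;
	}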
Reviewed-by: Liming Wang Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A418728.5020506@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 076fa6f0ee48..3bb31006b5cc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2053,25 +2053,23 @@ static int tracing_open(struct inode *inode, struct file *file) static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - struct tracer *t = m->private; + struct tracer *t = v; (*pos)++; if (t) t = t->next; - m->private = t; - return t; } static void *t_start(struct seq_file *m, loff_t *pos) { - struct tracer *t = m->private; + struct tracer *t; loff_t l = 0; mutex_lock(&trace_types_lock); - for (; t && l < *pos; t = t_next(m, t, &l)) + for (t = trace_types; t && l < *pos; t = t_next(m, t, &l)) ; return t; @@ -2107,18 +2105,10 @@ static struct seq_operations show_traces_seq_ops = { static int show_traces_open(struct inode *inode, struct file *file) { - int ret; - if (tracing_disabled) return -ENODEV; - ret = seq_open(file, &show_traces_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = trace_types; - } - - return ret; + return seq_open(file, &show_traces_seq_ops); } static ssize_t -- cgit v1.2.3 From 85951842a1020669f0a9eb0f0d1853b41341f097 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Jun 2009 09:54:00 +0800 Subject: ftrace: Don't increment @pos in g_start() It's wrong to increment @pos in g_start(). It causes some entries lost when reading set_graph_function, if the output of the file is larger than PAGE_SIZE. Reviewed-by: Liming Wang Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A418738.7090401@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3718d55fb4c3..cde74b9973b7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2500,32 +2500,31 @@ int ftrace_graph_count; unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; static void * -g_next(struct seq_file *m, void *v, loff_t *pos) +__g_next(struct seq_file *m, loff_t *pos) { unsigned long *array = m->private; - int index = *pos; - - (*pos)++; - if (index >= ftrace_graph_count) + if (*pos >= ftrace_graph_count) return NULL; + return &array[*pos]; +} - return &array[index]; +static void * +g_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return __g_next(m, pos); } static void *g_start(struct seq_file *m, loff_t *pos) { - void *p = NULL; - mutex_lock(&graph_lock); /* Nothing, tell g_show to print all functions are enabled */ if (!ftrace_graph_count && !*pos) return (void *)1; - p = g_next(m, p, pos); - - return p; + return __g_next(m, pos); } static void g_stop(struct seq_file *m, void *p) -- cgit v1.2.3 From 694ce0a544fba37a60025a6803ee6265be8a2a22 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Jun 2009 09:54:19 +0800 Subject: ftrace: Don't manipulate @pos in t_start() It's rather confusing that in t_start(), in some cases @pos is incremented, and in some cases it's decremented and then incremented. This patch rewrites t_start() in a much more general way. 
Thus we fix a bug that if ftrace_filtered == 1, functions have tracer hooks won't be printed, because the branch is always unreachable: static void *t_start(...) { ... if (!p) return t_hash_start(m, pos); return p; } Before: # echo 'sys_open' > /mnt/tracing/set_ftrace_filter # echo 'sys_write:traceon:4' >> /mnt/tracing/set_ftrace_filter sys_open After: # echo 'sys_open' > /mnt/tracing/set_ftrace_filter # echo 'sys_write:traceon:4' >> /mnt/tracing/set_ftrace_filter sys_open sys_write:traceon:count=4 Reviewed-by: Liming Wang Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A41874B.4090507@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index cde74b9973b7..dc810208edde 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1467,8 +1467,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos) iter->pg = iter->pg->next; iter->idx = 0; goto retry; - } else { - iter->idx = -1; } } else { rec = &iter->pg->records[iter->idx++]; @@ -1497,6 +1495,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; void *p = NULL; + loff_t l; mutex_lock(&ftrace_lock); /* @@ -1508,23 +1507,21 @@ static void *t_start(struct seq_file *m, loff_t *pos) if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; - (*pos)++; return iter; } if (iter->flags & FTRACE_ITER_HASH) return t_hash_start(m, pos); - if (*pos > 0) { - if (iter->idx < 0) - return p; - (*pos)--; - iter->idx--; + iter->pg = ftrace_pages_start; + iter->idx = 0; + for (l = 0; l <= *pos; ) { + p = t_next(m, p, &l); + if (!p) + break; } - p = t_next(m, p, pos); - - if (!p) + if (!p && iter->flags & FTRACE_ITER_FILTER) return t_hash_start(m, pos); return p; -- cgit v1.2.3 From d82d62444f87e5993af2fa82ed636b2206e052ea Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Jun 2009 09:54:54 +0800 Subject: ftrace: Fix t_hash_start() When the output of set_ftrace_filter is larger than PAGE_SIZE, t_hash_start() will be called the 2nd time, and then we start from the head of a hlist, which is wrong and causes some entries to be outputed twice. The worse is, if the hlist is large enough, reading set_ftrace_filter won't stop but in a dead loop. 
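The fix below reuses the idiom the earlier iterator patches introduced: rebuild the cursor on every ->start() by walking next() forward until *pos is reached. Factored out, the idiom looks like this (a sketch only, no such helper exists in the tree):

	/* Generic form of the "for (l = 0; l <= *pos; )" loops in these patches.
	 * next() must return NULL past the last element or this never terminates;
	 * the pre-fix dead loop came from the related mistake of ignoring *pos
	 * entirely when ->start() was re-entered. */
	static void *walk_to_pos(struct seq_file *m, loff_t *pos,
				 void *(*next)(struct seq_file *, void *, loff_t *))
	{
		void *p = NULL;
		loff_t l;

		for (l = 0; l <= *pos; ) {
			p = next(m, p, &l);
			if (!p)
				break;
		}
		return p;
	}

It trades a linear re-walk on each start() for correctness, which is a negligible cost for debugfs-sized listings.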
Reviewed-by: Liming Wang Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A41876E.2060407@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index dc810208edde..71a52c172140 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1417,10 +1417,20 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; void *p = NULL; + loff_t l; + + if (!(iter->flags & FTRACE_ITER_HASH)) + *pos = 0; iter->flags |= FTRACE_ITER_HASH; - return t_hash_next(m, p, pos); + iter->hidx = 0; + for (l = 0; l <= *pos; ) { + p = t_hash_next(m, p, &l); + if (!p) + break; + } + return p; } static int t_hash_show(struct seq_file *m, void *v) -- cgit v1.2.3 From 507e123151149e578c9aae33eb876c49824da5f8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 23 Jun 2009 17:38:15 +0200 Subject: timer stats: Optimize by adding quick check to avoid function calls When the kernel is configured with CONFIG_TIMER_STATS but timer stats are runtime disabled we still get calls to __timer_stats_timer_set_start_info which initializes some fields in the corresponding struct timer_list. So add some quick checks in the the timer stats setup functions to avoid function calls to __timer_stats_timer_set_start_info when timer stats are disabled. In an artificial workload that does nothing but playing ping pong with a single tcp packet via loopback this decreases cpu consumption by 1 - 1.5%. This is part of a modified function trace output on SLES11: perl-2497 [00] 28630647177732388 [+ 125]: sk_reset_timer <-tcp_v4_rcv perl-2497 [00] 28630647177732513 [+ 125]: mod_timer <-sk_reset_timer perl-2497 [00] 28630647177732638 [+ 125]: __timer_stats_timer_set_start_info <-mod_timer perl-2497 [00] 28630647177732763 [+ 125]: __mod_timer <-mod_timer perl-2497 [00] 28630647177732888 [+ 125]: __timer_stats_timer_set_start_info <-__mod_timer perl-2497 [00] 28630647177733013 [+ 93]: lock_timer_base <-__mod_timer Signed-off-by: Heiko Carstens Cc: Andrew Morton Cc: Martin Schwidefsky Cc: Mustafa Mesanovic Cc: Arjan van de Ven LKML-Reference: <20090623153811.GA4641@osiris.boeblingen.de.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/hrtimer.h | 5 +++++ include/linux/timer.h | 4 ++++ kernel/time/timer_stats.c | 16 ++++++++-------- kernel/timer.c | 2 ++ 4 files changed, 19 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 7400900de94a..54648e625efd 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -21,6 +21,7 @@ #include #include #include +#include struct hrtimer_clock_base; @@ -447,6 +448,8 @@ extern void timer_stats_update_stats(void *timer, pid_t pid, void *startf, static inline void timer_stats_account_hrtimer(struct hrtimer *timer) { + if (likely(!timer->start_pid)) + return; timer_stats_update_stats(timer, timer->start_pid, timer->start_site, timer->function, timer->start_comm, 0); } @@ -456,6 +459,8 @@ extern void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) { + if (likely(!timer_stats_active)) + return; __timer_stats_hrtimer_set_start_info(timer, __builtin_return_address(0)); } diff --git a/include/linux/timer.h b/include/linux/timer.h index ccf882eed8f8..be62ec2ebea5 100644 --- a/include/linux/timer.h +++ 
b/include/linux/timer.h @@ -190,6 +190,8 @@ extern unsigned long get_next_timer_interrupt(unsigned long now); */ #ifdef CONFIG_TIMER_STATS +extern int timer_stats_active; + #define TIMER_STATS_FLAG_DEFERRABLE 0x1 extern void init_timer_stats(void); @@ -203,6 +205,8 @@ extern void __timer_stats_timer_set_start_info(struct timer_list *timer, static inline void timer_stats_timer_set_start_info(struct timer_list *timer) { + if (likely(!timer_stats_active)) + return; __timer_stats_timer_set_start_info(timer, __builtin_return_address(0)); } diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index c994530d166d..4cde8b9c716f 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -96,7 +96,7 @@ static DEFINE_MUTEX(show_mutex); /* * Collection status, active/inactive: */ -static int __read_mostly active; +int __read_mostly timer_stats_active; /* * Beginning/end timestamps of measurement: @@ -242,7 +242,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, struct entry *entry, input; unsigned long flags; - if (likely(!active)) + if (likely(!timer_stats_active)) return; lock = &per_cpu(lookup_lock, raw_smp_processor_id()); @@ -254,7 +254,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, input.timer_flag = timer_flag; spin_lock_irqsave(lock, flags); - if (!active) + if (!timer_stats_active) goto out_unlock; entry = tstat_lookup(&input, comm); @@ -290,7 +290,7 @@ static int tstats_show(struct seq_file *m, void *v) /* * If still active then calculate up to now: */ - if (active) + if (timer_stats_active) time_stop = ktime_get(); time = ktime_sub(time_stop, time_start); @@ -368,18 +368,18 @@ static ssize_t tstats_write(struct file *file, const char __user *buf, mutex_lock(&show_mutex); switch (ctl[0]) { case '0': - if (active) { - active = 0; + if (timer_stats_active) { + timer_stats_active = 0; time_stop = ktime_get(); sync_access(); } break; case '1': - if (!active) { + if (!timer_stats_active) { reset_entries(); time_start = ktime_get(); smp_mb(); - active = 1; + timer_stats_active = 1; } break; default: diff --git a/kernel/timer.c b/kernel/timer.c index 54d3912f8cad..0b36b9e5cc8b 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -380,6 +380,8 @@ static void timer_stats_account_timer(struct timer_list *timer) { unsigned int flag = 0; + if (likely(!timer->start_site)) + return; if (unlikely(tbase_get_deferrable(timer->base))) flag |= TIMER_STATS_FLAG_DEFERRABLE; -- cgit v1.2.3 From 9d612beff5089b89a295a2331883a8ce3fff08c1 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Jun 2009 17:33:15 +0800 Subject: tracing: Fix trace_buf_size boot option We should be able to specify [KMG] when setting trace_buf_size boot option, as documented in kernel-parameters.txt Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A41F2DB.4020102@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 3 ++- kernel/trace/trace.c | 5 ++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 92e1ab8178a8..d3f41db3ed49 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2475,7 +2475,8 @@ and is between 256 and 4096 characters. It is defined in the file tp720= [HW,PS2] - trace_buf_size=nn[KMG] [ftrace] will set tracing buffer size. + trace_buf_size=nn[KMG] + [FTRACE] will set tracing buffer size. 
trix= [HW,OSS] MediaTrix AudioTrix Pro Format: diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3bb31006b5cc..3aa0a0dfdfa8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -284,13 +284,12 @@ void trace_wake_up(void) static int __init set_buf_size(char *str) { unsigned long buf_size; - int ret; if (!str) return 0; - ret = strict_strtoul(str, 0, &buf_size); + buf_size = memparse(str, &str); /* nr_entries can not be zero */ - if (ret < 0 || buf_size == 0) + if (buf_size == 0) return 0; trace_buf_size = buf_size; return 1; -- cgit v1.2.3 From d0725992c8a6fb63a16bc9e8b2a50094cc4db3cd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 11 Jun 2009 23:15:43 +0200 Subject: futex: Fix the write access fault problem for real commit 64d1304a64 (futex: setup writeable mapping for futex ops which modify user space data) did address only half of the problem of write access faults. The patch was made on two wrong assumptions: 1) access_ok(VERIFY_WRITE,...) would actually check write access. On x86 it does _NOT_. It's a pure address range check. 2) a RW mapped region can not go away under us. That's wrong as well. Nobody can prevent another thread to call mprotect(PROT_READ) on that region where the futex resides. If that call hits between the get_user_pages_fast() verification and the actual write access in the atomic region we are toast again. The solution is to not rely on access_ok and get_user() for any write access related fault on private and shared futexes. Instead we need to fault it in with verification of write access. There is no generic non destructive write mechanism which would fault the user page in trough a #PF, but as we already know that we will fault we can as well call get_user_pages() directly and avoid the #PF overhead. If get_user_pages() returns -EFAULT we know that we can not fix it anymore and need to bail out to user space. Remove a bunch of confusing comments on this issue as well. Signed-off-by: Thomas Gleixner Cc: stable@kernel.org --- kernel/futex.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 80b5ce716596..1c337112335c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -284,6 +284,25 @@ void put_futex_key(int fshared, union futex_key *key) drop_futex_key_refs(key); } +/* + * fault_in_user_writeable - fault in user address and verify RW access + * @uaddr: pointer to faulting user space address + * + * Slow path to fixup the fault we just took in the atomic write + * access to @uaddr. + * + * We have no generic implementation of a non destructive write to the + * user address. We know that we faulted in the atomic pagefault + * disabled section so we can as well avoid the #PF overhead by + * calling get_user_pages() right away. + */ +static int fault_in_user_writeable(u32 __user *uaddr) +{ + int ret = get_user_pages(current, current->mm, (unsigned long)uaddr, + sizeof(*uaddr), 1, 0, NULL, NULL); + return ret < 0 ? 
ret : 0; +} + /** * futex_top_waiter() - Return the highest priority waiter on a futex * @hb: the hash bucket the futex_q's reside in @@ -896,7 +915,6 @@ retry: retry_private: op_ret = futex_atomic_op_inuser(op, uaddr2); if (unlikely(op_ret < 0)) { - u32 dummy; double_unlock_hb(hb1, hb2); @@ -914,7 +932,7 @@ retry_private: goto out_put_keys; } - ret = get_user(dummy, uaddr2); + ret = fault_in_user_writeable(uaddr2); if (ret) goto out_put_keys; @@ -1204,7 +1222,7 @@ retry_private: double_unlock_hb(hb1, hb2); put_futex_key(fshared, &key2); put_futex_key(fshared, &key1); - ret = get_user(curval2, uaddr2); + ret = fault_in_user_writeable(uaddr2); if (!ret) goto retry; goto out; @@ -1482,7 +1500,7 @@ retry: handle_fault: spin_unlock(q->lock_ptr); - ret = get_user(uval, uaddr); + ret = fault_in_user_writeable(uaddr); spin_lock(q->lock_ptr); @@ -1807,7 +1825,6 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, { struct hrtimer_sleeper timeout, *to = NULL; struct futex_hash_bucket *hb; - u32 uval; struct futex_q q; int res, ret; @@ -1909,16 +1926,9 @@ out: return ret != -EINTR ? ret : -ERESTARTNOINTR; uaddr_faulted: - /* - * We have to r/w *(int __user *)uaddr, and we have to modify it - * atomically. Therefore, if we continue to fault after get_user() - * below, we need to handle the fault ourselves, while still holding - * the mmap_sem. This can occur if the uaddr is under contention as - * we have to drop the mmap_sem in order to call get_user(). - */ queue_unlock(&q, hb); - ret = get_user(uval, uaddr); + ret = fault_in_user_writeable(uaddr); if (ret) goto out_put_key; @@ -2013,17 +2023,10 @@ out: return ret; pi_faulted: - /* - * We have to r/w *(int __user *)uaddr, and we have to modify it - * atomically. Therefore, if we continue to fault after get_user() - * below, we need to handle the fault ourselves, while still holding - * the mmap_sem. This can occur if the uaddr is under contention as - * we have to drop the mmap_sem in order to call get_user(). - */ spin_unlock(&hb->lock); put_futex_key(fshared, &key); - ret = get_user(uval, uaddr); + ret = fault_in_user_writeable(uaddr); if (!ret) goto retry; -- cgit v1.2.3 From 3a6a6c16be78472a52f6dd7d88913373b42ad0f7 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 24 Jun 2009 16:09:01 -0400 Subject: audit: inode watches depend on CONFIG_AUDIT not CONFIG_AUDIT_SYSCALL Even though one cannot make use of the audit watch code without CONFIG_AUDIT_SYSCALL the spaghetti nature of the audit code means that the audit rule filtering requires that it at least be compiled. Thus build the audit_watch code when we build auditfilter like it was before cfcad62c74abfef83762dc05a556d21bdf3980a2 Clearly this is a point of potential future cleanup.. 
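One shape that cleanup could take is the usual config-stub idiom, letting the filter code compile against no-op watch helpers instead of always linking audit_watch.o. A sketch only; the prototype matches kernel/audit.h after the earlier patches in this series, while the stub body and error code are assumptions:

	#ifdef CONFIG_AUDITSYSCALL
	extern int audit_add_watch(struct audit_krule *krule);
	#else
	static inline int audit_add_watch(struct audit_krule *krule)
	{
		return -EINVAL;		/* watch rules not supported in this config */
	}
	#endif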
Reported-by: Frans Pop Signed-off-by: Eric Paris Signed-off-by: Al Viro --- kernel/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index da750010a6fc..780c8dcf4516 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -69,8 +69,8 @@ obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o -obj-$(CONFIG_AUDIT) += audit.o auditfilter.o -obj-$(CONFIG_AUDITSYSCALL) += auditsc.o audit_watch.o +obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o +obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_KPROBES) += kprobes.o -- cgit v1.2.3 From 00e54d087afb3867b0b461aef6c1ff433d0df564 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 25 Jun 2009 14:05:27 +0800 Subject: ftrace: Remove duplicate newline Before: # echo 'sys_open:traceon:' > set_ftrace_filter # echo 'sys_close:traceoff:5' > set_ftrace_filter # cat set_ftrace_filter #### all functions enabled #### sys_open:traceon:unlimited sys_close:traceoff:count=0 After: # cat set_ftrace_filter #### all functions enabled #### sys_open:traceon:unlimited sys_close:traceoff:count=0 Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A4313A7.7030105@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 90f134764837..7402144bff21 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -302,8 +302,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, if (count == -1) seq_printf(m, ":unlimited\n"); else - seq_printf(m, ":count=%ld", count); - seq_putc(m, '\n'); + seq_printf(m, ":count=%ld\n", count); return 0; } -- cgit v1.2.3 From 1155de47cd66d0c496d5a6fb2223e980ef1285b2 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Thu, 25 Jun 2009 14:30:12 +0900 Subject: ring-buffer: Make it generally available In hunting down the cause for the hwlat_detector ring buffer spew in my failed -next builds it became obvious that folks are now treating ring_buffer as something that is generic independent of tracing and thus, suitable for public driver consumption. Given that there are only a few minor areas in ring_buffer that have any reliance on CONFIG_TRACING or CONFIG_FUNCTION_TRACER, provide stubs for those and make it generally available. 
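For a driver, "generally available" means it can pull the ring buffer in on its own instead of piggy-backing on the tracers. A sketch of what such a consumer might look like; the Kconfig select and the allocation flag/signature are written from memory of this API generation and should be treated as assumptions, not as part of the patch:

	/* Kconfig side (assumed):
	 *   config MY_SAMPLER
	 *           tristate "example ring buffer consumer"
	 *           select RING_BUFFER
	 */
	#include <linux/module.h>
	#include <linux/ring_buffer.h>

	static struct ring_buffer *my_rb;

	static int __init my_sampler_init(void)
	{
		my_rb = ring_buffer_alloc(16 * 1024, RB_FL_OVERWRITE);
		return my_rb ? 0 : -ENOMEM;
	}

	static void __exit my_sampler_exit(void)
	{
		ring_buffer_free(my_rb);
	}

	module_init(my_sampler_init);
	module_exit(my_sampler_exit);
	MODULE_LICENSE("GPL");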
Signed-off-by: Paul Mundt Cc: Jon Masters Cc: Steven Rostedt LKML-Reference: <20090625053012.GB19944@linux-sh.org> Signed-off-by: Ingo Molnar --- kernel/Makefile | 1 + kernel/trace/ring_buffer.c | 11 +++++++++++ kernel/trace/trace.h | 7 +++++++ 3 files changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 0a32cb21ec97..0630e293cd49 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -96,6 +96,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ +obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_SLOW_WORK) += slow-work.o obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 04dac2638258..bf27bb7a63e2 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1563,6 +1563,8 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, return NULL; } +#ifdef CONFIG_TRACING + #define TRACE_RECURSIVE_DEPTH 16 static int trace_recursive_lock(void) @@ -1593,6 +1595,13 @@ static void trace_recursive_unlock(void) current->trace_recursion--; } +#else + +#define trace_recursive_lock() (0) +#define trace_recursive_unlock() do { } while (0) + +#endif + static DEFINE_PER_CPU(int, rb_need_resched); /** @@ -3104,6 +3113,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_read_page); +#ifdef CONFIG_TRACING static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -3171,6 +3181,7 @@ static __init int rb_init_debugfs(void) } fs_initcall(rb_init_debugfs); +#endif #ifdef CONFIG_HOTPLUG_CPU static int rb_cpu_notify(struct notifier_block *self, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6e735d4771f8..3548ae5cc780 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -597,6 +597,7 @@ print_graph_function(struct trace_iterator *iter) extern struct pid *ftrace_pid_trace; +#ifdef CONFIG_FUNCTION_TRACER static inline int ftrace_trace_task(struct task_struct *task) { if (!ftrace_pid_trace) @@ -604,6 +605,12 @@ static inline int ftrace_trace_task(struct task_struct *task) return test_tsk_trace_trace(task); } +#else +static inline int ftrace_trace_task(struct task_struct *task) +{ + return 1; +} +#endif /* * trace_iterator_flags is an enumeration that defines bit -- cgit v1.2.3 From aa715284b4d28cabde6c25c568d769a6be712bc8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Jun 2009 14:27:58 +0200 Subject: futex: request only one page from get_user_pages() Yanmin noticed that fault_in_user_writeable() requests 4 pages instead of one. That's the result of blindly trusting Linus' proposal :) I even looked up the prototype to verify the correctness: the argument in question is confusingly enough named "len" while in reality it means number of pages. Pointed-out-by: Yanmin Zhang Signed-off-by: Thomas Gleixner --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 1c337112335c..794c862125fe 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -299,7 +299,7 @@ void put_futex_key(int fshared, union futex_key *key) static int fault_in_user_writeable(u32 __user *uaddr) { int ret = get_user_pages(current, current->mm, (unsigned long)uaddr, - sizeof(*uaddr), 1, 0, NULL, NULL); + 1, 1, 0, NULL, NULL); return ret < 0 ? 
ret : 0; } -- cgit v1.2.3 From 7f8b4e4e0988dadfd22330fd147ad2453e19f510 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jun 2009 14:34:35 +0200 Subject: perf_counter: Add scale information to the mmap control page Add the needed time scale to the self-profile mmap information. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 +++- kernel/perf_counter.c | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 489d5cbfbcca..bcbf1c43ed42 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -232,12 +232,14 @@ struct perf_counter_mmap_page { __u32 lock; /* seqlock for synchronization */ __u32 index; /* hardware counter identifier */ __s64 offset; /* add to hardware counter value */ + __u64 time_enabled; /* time counter active */ + __u64 time_running; /* time counter on cpu */ /* * Hole for extension of the self monitor capabilities */ - __u64 __reserved[125]; /* align to 1k */ + __u64 __reserved[123]; /* align to 1k */ /* * Control data for the mmap() data buffer. diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c2b19c111718..23614adab475 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1782,6 +1782,12 @@ void perf_counter_update_userpage(struct perf_counter *counter) if (counter->state == PERF_COUNTER_STATE_ACTIVE) userpg->offset -= atomic64_read(&counter->hw.prev_count); + userpg->time_enabled = counter->total_time_enabled + + atomic64_read(&counter->child_total_time_enabled); + + userpg->time_running = counter->total_time_running + + atomic64_read(&counter->child_total_time_running); + barrier(); ++userpg->lock; preempt_enable(); -- cgit v1.2.3 From 194002b274e9169a04beb1b23dcc132159bb566c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jun 2009 16:35:24 +0200 Subject: perf_counter, x86: Add mmap counter read support Update the mmap control page with the needed information to use the userspace RDPMC instruction for self monitoring. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/perf_counter.h | 2 ++ arch/x86/include/asm/perf_counter.h | 3 +++ arch/x86/kernel/cpu/perf_counter.c | 6 ++++++ kernel/perf_counter.c | 10 +++++++++- 4 files changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h index 8ccd4e155768..0ea0639fcf75 100644 --- a/arch/powerpc/include/asm/perf_counter.h +++ b/arch/powerpc/include/asm/perf_counter.h @@ -61,6 +61,8 @@ struct pt_regs; extern unsigned long perf_misc_flags(struct pt_regs *regs); extern unsigned long perf_instruction_pointer(struct pt_regs *regs); +#define PERF_COUNTER_INDEX_OFFSET 1 + /* * Only override the default definitions in include/linux/perf_counter.h * if we have hardware PMU support. 
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index 5fb33e160ea0..fa64e401589d 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -87,6 +87,9 @@ union cpuid10_edx { #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); extern void perf_counters_lapic_init(void); + +#define PERF_COUNTER_INDEX_OFFSET 0 + #else static inline void init_hw_perf_counters(void) { } static inline void perf_counters_lapic_init(void) { } diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a310d19faca3..b83474b6021a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -912,6 +912,8 @@ x86_perf_counter_set_period(struct perf_counter *counter, err = checking_wrmsrl(hwc->counter_base + idx, (u64)(-left) & x86_pmu.counter_mask); + perf_counter_update_userpage(counter); + return ret; } @@ -1034,6 +1036,8 @@ try_generic: x86_perf_counter_set_period(counter, hwc, idx); x86_pmu.enable(hwc, idx); + perf_counter_update_userpage(counter); + return 0; } @@ -1126,6 +1130,8 @@ static void x86_pmu_disable(struct perf_counter *counter) x86_perf_counter_update(counter, hwc, idx); cpuc->counters[idx] = NULL; clear_bit(idx, cpuc->used_mask); + + perf_counter_update_userpage(counter); } /* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 23614adab475..02994a719e27 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1753,6 +1753,14 @@ int perf_counter_task_disable(void) return 0; } +static int perf_counter_index(struct perf_counter *counter) +{ + if (counter->state != PERF_COUNTER_STATE_ACTIVE) + return 0; + + return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET; +} + /* * Callers need to ensure there can be no nesting of this function, otherwise * the seqlock logic goes bad. We can not serialize this because the arch @@ -1777,7 +1785,7 @@ void perf_counter_update_userpage(struct perf_counter *counter) preempt_disable(); ++userpg->lock; barrier(); - userpg->index = counter->hw.idx; + userpg->index = perf_counter_index(counter); userpg->offset = atomic64_read(&counter->count); if (counter->state == PERF_COUNTER_STATE_ACTIVE) userpg->offset -= atomic64_read(&counter->hw.prev_count); -- cgit v1.2.3 From 38b200d67636a30cb8dc1508137908e7a649b5c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Jun 2009 20:13:11 +0200 Subject: perf_counter: Add PERF_EVENT_READ Provide a read() like event which can be used to log the counter value at specific sites such as child->parent folding on exit. In order to be useful, we log the counter parent ID, not the actual counter ID, since userspace can only relate parent IDs to perf_counter_attr constructs. 
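Viewed from the mmap'ed event stream, the new record follows the layout comment added below. A hedged userspace-side rendering (the struct and field names here are chosen for illustration; only the layout and the PERF_FORMAT_* gating come from the patch):

	struct read_event {
		struct perf_event_header header;	/* header.type == PERF_EVENT_READ */
		__u32	pid, tid;
		__u64	value;
		/*
		 * Then, in this order, only if the corresponding bit was set in
		 * attr.read_format:
		 *   __u64 time_enabled;	PERF_FORMAT_TOTAL_TIME_ENABLED
		 *   __u64 time_running;	PERF_FORMAT_TOTAL_TIME_RUNNING
		 *   __u64 id;			PERF_FORMAT_ID (the parent's id for inherited counters)
		 */
	};

Reporting the parent id rather than the per-task child id is what lets a profiler tie these records back to the counters it actually created.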
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 12 ++++++++ kernel/perf_counter.c | 72 +++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 80 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index bcbf1c43ed42..6a384f04755a 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -334,6 +334,18 @@ enum perf_event_type { */ PERF_EVENT_FORK = 7, + /* + * struct { + * struct perf_event_header header; + * u32 pid, tid; + * u64 value; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 parent_id; } && PERF_FORMAT_ID + * }; + */ + PERF_EVENT_READ = 8, + /* * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field * will be PERF_SAMPLE_* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 02994a719e27..a72c20e91953 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2623,6 +2623,66 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, perf_output_end(&handle); } +/* + * read event + */ + +struct perf_read_event { + struct perf_event_header header; + + u32 pid; + u32 tid; + u64 value; + u64 format[3]; +}; + +static void +perf_counter_read_event(struct perf_counter *counter, + struct task_struct *task) +{ + struct perf_output_handle handle; + struct perf_read_event event = { + .header = { + .type = PERF_EVENT_READ, + .misc = 0, + .size = sizeof(event) - sizeof(event.format), + }, + .pid = perf_counter_pid(counter, task), + .tid = perf_counter_tid(counter, task), + .value = atomic64_read(&counter->count), + }; + int ret, i = 0; + + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + event.header.size += sizeof(u64); + event.format[i++] = counter->total_time_enabled; + } + + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + event.header.size += sizeof(u64); + event.format[i++] = counter->total_time_running; + } + + if (counter->attr.read_format & PERF_FORMAT_ID) { + u64 id; + + event.header.size += sizeof(u64); + if (counter->parent) + id = counter->parent->id; + else + id = counter->id; + + event.format[i++] = id; + } + + ret = perf_output_begin(&handle, counter, event.header.size, 0, 0); + if (ret) + return; + + perf_output_copy(&handle, &event, event.header.size); + perf_output_end(&handle); +} + /* * fork tracking */ @@ -3985,10 +4045,13 @@ static int inherit_group(struct perf_counter *parent_counter, } static void sync_child_counter(struct perf_counter *child_counter, - struct perf_counter *parent_counter) + struct task_struct *child) { + struct perf_counter *parent_counter = child_counter->parent; u64 child_val; + perf_counter_read_event(child_counter, child); + child_val = atomic64_read(&child_counter->count); /* @@ -4017,7 +4080,8 @@ static void sync_child_counter(struct perf_counter *child_counter, static void __perf_counter_exit_task(struct perf_counter *child_counter, - struct perf_counter_context *child_ctx) + struct perf_counter_context *child_ctx, + struct task_struct *child) { struct perf_counter *parent_counter; @@ -4031,7 +4095,7 @@ __perf_counter_exit_task(struct perf_counter *child_counter, * counters need to be zapped - but otherwise linger. 
*/ if (parent_counter) { - sync_child_counter(child_counter, parent_counter); + sync_child_counter(child_counter, child); free_counter(child_counter); } } @@ -4093,7 +4157,7 @@ void perf_counter_exit_task(struct task_struct *child) again: list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, list_entry) - __perf_counter_exit_task(child_counter, child_ctx); + __perf_counter_exit_task(child_counter, child_ctx, child); /* * If the last counter was a group counter, it will have appended all -- cgit v1.2.3 From bfbd3381e63aa2a14c6706afb50ce4630aa0d9a2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Jun 2009 21:11:59 +0200 Subject: perf_counter: Implement more accurate per task statistics With the introduction of PERF_EVENT_READ we have the possibility to provide accurate counter values for individual tasks in a task hierarchy. However, due to the lazy context switching used for similar counter contexts our current per task counts are way off. In order to maintain some of the lazy switch benefits we don't disable it out-right, but simply iterate the active counters and flip the values between the contexts. This only reads the counters but does not need to reprogram the full PMU. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ++- kernel/perf_counter.c | 83 ++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 83 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 6a384f04755a..de70a10b5ec8 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -178,8 +178,9 @@ struct perf_counter_attr { mmap : 1, /* include mmap data */ comm : 1, /* include comm data */ freq : 1, /* use freq, not period */ + inherit_stat : 1, /* per task counts */ - __reserved_1 : 53; + __reserved_1 : 52; __u32 wakeup_events; /* wakeup every n events */ __u32 __reserved_2; @@ -602,6 +603,7 @@ struct perf_counter_context { int nr_counters; int nr_active; int is_active; + int nr_stat; atomic_t refcount; struct task_struct *task; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index a72c20e91953..385ca51c6e60 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -236,6 +236,8 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) list_add_rcu(&counter->event_entry, &ctx->event_list); ctx->nr_counters++; + if (counter->attr.inherit_stat) + ctx->nr_stat++; } /* @@ -250,6 +252,8 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) if (list_empty(&counter->list_entry)) return; ctx->nr_counters--; + if (counter->attr.inherit_stat) + ctx->nr_stat--; list_del_init(&counter->list_entry); list_del_rcu(&counter->event_entry); @@ -1006,6 +1010,76 @@ static int context_equiv(struct perf_counter_context *ctx1, && !ctx1->pin_count && !ctx2->pin_count; } +static void __perf_counter_read(void *counter); + +static void __perf_counter_sync_stat(struct perf_counter *counter, + struct perf_counter *next_counter) +{ + u64 value; + + if (!counter->attr.inherit_stat) + return; + + /* + * Update the counter value, we cannot use perf_counter_read() + * because we're in the middle of a context switch and have IRQs + * disabled, which upsets smp_call_function_single(), however + * we know the counter must be on the current CPU, therefore we + * don't need to use it. 
+ */ + switch (counter->state) { + case PERF_COUNTER_STATE_ACTIVE: + __perf_counter_read(counter); + break; + + case PERF_COUNTER_STATE_INACTIVE: + update_counter_times(counter); + break; + + default: + break; + } + + /* + * In order to keep per-task stats reliable we need to flip the counter + * values when we flip the contexts. + */ + value = atomic64_read(&next_counter->count); + value = atomic64_xchg(&counter->count, value); + atomic64_set(&next_counter->count, value); + + /* + * XXX also sync time_enabled and time_running ? + */ +} + +#define list_next_entry(pos, member) \ + list_entry(pos->member.next, typeof(*pos), member) + +static void perf_counter_sync_stat(struct perf_counter_context *ctx, + struct perf_counter_context *next_ctx) +{ + struct perf_counter *counter, *next_counter; + + if (!ctx->nr_stat) + return; + + counter = list_first_entry(&ctx->event_list, + struct perf_counter, event_entry); + + next_counter = list_first_entry(&next_ctx->event_list, + struct perf_counter, event_entry); + + while (&counter->event_entry != &ctx->event_list && + &next_counter->event_entry != &next_ctx->event_list) { + + __perf_counter_sync_stat(counter, next_counter); + + counter = list_next_entry(counter, event_entry); + next_counter = list_next_entry(counter, event_entry); + } +} + /* * Called from scheduler to remove the counters of the current task, * with interrupts disabled. @@ -1061,6 +1135,8 @@ void perf_counter_task_sched_out(struct task_struct *task, ctx->task = next; next_ctx->task = task; do_switch = 0; + + perf_counter_sync_stat(ctx, next_ctx); } spin_unlock(&next_ctx->lock); spin_unlock(&ctx->lock); @@ -1350,7 +1426,7 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) /* * Cross CPU call to read the hardware counter */ -static void __read(void *info) +static void __perf_counter_read(void *info) { struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; @@ -1372,7 +1448,7 @@ static u64 perf_counter_read(struct perf_counter *counter) */ if (counter->state == PERF_COUNTER_STATE_ACTIVE) { smp_call_function_single(counter->oncpu, - __read, counter, 1); + __perf_counter_read, counter, 1); } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { update_counter_times(counter); } @@ -4050,7 +4126,8 @@ static void sync_child_counter(struct perf_counter *child_counter, struct perf_counter *parent_counter = child_counter->parent; u64 child_val; - perf_counter_read_event(child_counter, child); + if (child_counter->attr.inherit_stat) + perf_counter_read_event(child_counter, child); child_val = atomic64_read(&child_counter->count); -- cgit v1.2.3 From e6e18ec79b023d5fe84226cef533cf0e3770ce93 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Jun 2009 11:27:12 +0200 Subject: perf_counter: Rework the sample ABI The PERF_EVENT_READ implementation made me realize we don't actually need the sample_type int the output sample, since we already have that in the perf_counter_attr information. Therefore, remove the PERF_EVENT_MISC_OVERFLOW bit and the event->type overloading, and imply put counter overflow samples in a PERF_EVENT_SAMPLE type. This also fixes the issue that event->type was only 32-bit and sample_type had 64 usable bits. 
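[Editor's illustration, not part of this series: with the overflow bit gone, a record is a sample iff header.type == PERF_EVENT_SAMPLE, and a reader sizes the fixed-length part of the body from the counter's attr.sample_type (known out of band) in the field order listed above. The PERF_SAMPLE_* bit values below are an assumption made so the sketch is self-contained; variable-length parts (GROUP, CALLCHAIN) still need the data itself to size.]

#include <stddef.h>
#include <stdint.h>

#define PERF_SAMPLE_IP		(1U << 0)
#define PERF_SAMPLE_TID		(1U << 1)
#define PERF_SAMPLE_TIME	(1U << 2)
#define PERF_SAMPLE_ADDR	(1U << 3)
#define PERF_SAMPLE_ID		(1U << 6)
#define PERF_SAMPLE_CPU		(1U << 7)
#define PERF_SAMPLE_PERIOD	(1U << 8)

/* size of the fixed-length part of a PERF_EVENT_SAMPLE body */
static size_t sample_fixed_size(uint64_t sample_type)
{
	size_t size = 8;			/* struct perf_event_header */

	if (sample_type & PERF_SAMPLE_IP)
		size += sizeof(uint64_t);	/* u64 ip */
	if (sample_type & PERF_SAMPLE_TID)
		size += sizeof(uint64_t);	/* u32 pid, tid */
	if (sample_type & PERF_SAMPLE_TIME)
		size += sizeof(uint64_t);
	if (sample_type & PERF_SAMPLE_ADDR)
		size += sizeof(uint64_t);
	if (sample_type & PERF_SAMPLE_ID)
		size += sizeof(uint64_t);
	if (sample_type & PERF_SAMPLE_CPU)
		size += sizeof(uint64_t);	/* u32 cpu, res */
	if (sample_type & PERF_SAMPLE_PERIOD)
		size += sizeof(uint64_t);

	return size;
}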
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 10 +++++----- kernel/perf_counter.c | 36 +++++++++++++++--------------------- tools/perf/builtin-annotate.c | 8 ++++---- tools/perf/builtin-report.c | 32 +++++++++++++++++++------------- tools/perf/builtin-top.c | 11 ++++++----- 5 files changed, 49 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index de70a10b5ec8..3078e23c91eb 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -262,7 +262,6 @@ struct perf_counter_mmap_page { #define PERF_EVENT_MISC_KERNEL (1 << 0) #define PERF_EVENT_MISC_USER (2 << 0) #define PERF_EVENT_MISC_HYPERVISOR (3 << 0) -#define PERF_EVENT_MISC_OVERFLOW (1 << 2) struct perf_event_header { __u32 type; @@ -348,9 +347,6 @@ enum perf_event_type { PERF_EVENT_READ = 8, /* - * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field - * will be PERF_SAMPLE_* - * * struct { * struct perf_event_header header; * @@ -358,8 +354,9 @@ enum perf_event_type { * { u32 pid, tid; } && PERF_SAMPLE_TID * { u64 time; } && PERF_SAMPLE_TIME * { u64 addr; } && PERF_SAMPLE_ADDR - * { u64 config; } && PERF_SAMPLE_CONFIG + * { u64 id; } && PERF_SAMPLE_ID * { u32 cpu, res; } && PERF_SAMPLE_CPU + * { u64 period; } && PERF_SAMPLE_PERIOD * * { u64 nr; * { u64 id, val; } cnt[nr]; } && PERF_SAMPLE_GROUP @@ -368,6 +365,9 @@ enum perf_event_type { * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN * }; */ + PERF_EVENT_SAMPLE = 9, + + PERF_EVENT_MAX, /* non-ABI */ }; enum perf_callchain_context { diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 385ca51c6e60..f2f232696587 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2575,15 +2575,14 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u32 cpu, reserved; } cpu_entry; - header.type = 0; + header.type = PERF_EVENT_SAMPLE; header.size = sizeof(header); - header.misc = PERF_EVENT_MISC_OVERFLOW; + header.misc = 0; header.misc |= perf_misc_flags(data->regs); if (sample_type & PERF_SAMPLE_IP) { ip = perf_instruction_pointer(data->regs); - header.type |= PERF_SAMPLE_IP; header.size += sizeof(ip); } @@ -2592,7 +2591,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, tid_entry.pid = perf_counter_pid(counter, current); tid_entry.tid = perf_counter_tid(counter, current); - header.type |= PERF_SAMPLE_TID; header.size += sizeof(tid_entry); } @@ -2602,34 +2600,25 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, */ time = sched_clock(); - header.type |= PERF_SAMPLE_TIME; header.size += sizeof(u64); } - if (sample_type & PERF_SAMPLE_ADDR) { - header.type |= PERF_SAMPLE_ADDR; + if (sample_type & PERF_SAMPLE_ADDR) header.size += sizeof(u64); - } - if (sample_type & PERF_SAMPLE_ID) { - header.type |= PERF_SAMPLE_ID; + if (sample_type & PERF_SAMPLE_ID) header.size += sizeof(u64); - } if (sample_type & PERF_SAMPLE_CPU) { - header.type |= PERF_SAMPLE_CPU; header.size += sizeof(cpu_entry); cpu_entry.cpu = raw_smp_processor_id(); } - if (sample_type & PERF_SAMPLE_PERIOD) { - header.type |= PERF_SAMPLE_PERIOD; + if (sample_type & PERF_SAMPLE_PERIOD) header.size += sizeof(u64); - } if (sample_type & PERF_SAMPLE_GROUP) { - header.type |= PERF_SAMPLE_GROUP; header.size += sizeof(u64) + counter->nr_siblings * sizeof(group_entry); } @@ -2639,10 +2628,9 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, if (callchain) { callchain_size = (1 + 
callchain->nr) * sizeof(u64); - - header.type |= PERF_SAMPLE_CALLCHAIN; header.size += callchain_size; - } + } else + header.size += sizeof(u64); } ret = perf_output_begin(&handle, counter, header.size, nmi, 1); @@ -2693,8 +2681,14 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } - if (callchain) - perf_output_copy(&handle, callchain, callchain_size); + if (sample_type & PERF_SAMPLE_CALLCHAIN) { + if (callchain) + perf_output_copy(&handle, callchain, callchain_size); + else { + u64 nr = 0; + perf_output_put(&handle, nr); + } + } perf_output_end(&handle); } diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 7e58e3ad1508..722c0f54e549 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -855,7 +855,7 @@ static unsigned long total = 0, total_unknown = 0; static int -process_overflow_event(event_t *event, unsigned long offset, unsigned long head) +process_sample_event(event_t *event, unsigned long offset, unsigned long head) { char level; int show = 0; @@ -1013,10 +1013,10 @@ process_period_event(event_t *event, unsigned long offset, unsigned long head) static int process_event(event_t *event, unsigned long offset, unsigned long head) { - if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) - return process_overflow_event(event, offset, head); - switch (event->header.type) { + case PERF_EVENT_SAMPLE: + return process_sample_event(event, offset, head); + case PERF_EVENT_MMAP: return process_mmap_event(event, offset, head); diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index e575f3039766..ec5361c67bf5 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -53,6 +53,8 @@ static regex_t parent_regex; static int exclude_other = 1; +static u64 sample_type; + struct ip_event { struct perf_event_header header; u64 ip; @@ -1135,7 +1137,7 @@ static int validate_chain(struct ip_callchain *chain, event_t *event) } static int -process_overflow_event(event_t *event, unsigned long offset, unsigned long head) +process_sample_event(event_t *event, unsigned long offset, unsigned long head) { char level; int show = 0; @@ -1147,12 +1149,12 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head) void *more_data = event->ip.__more_data; struct ip_callchain *chain = NULL; - if (event->header.type & PERF_SAMPLE_PERIOD) { + if (sample_type & PERF_SAMPLE_PERIOD) { period = *(u64 *)more_data; more_data += sizeof(u64); } - dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p period: %Ld\n", + dprintf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d: %p period: %Ld\n", (void *)(offset + head), (void *)(long)(event->header.size), event->header.misc, @@ -1160,7 +1162,7 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head) (void *)(long)ip, (long long)period); - if (event->header.type & PERF_SAMPLE_CALLCHAIN) { + if (sample_type & PERF_SAMPLE_CALLCHAIN) { int i; chain = (void *)more_data; @@ -1352,10 +1354,10 @@ process_event(event_t *event, unsigned long offset, unsigned long head) { trace_event(event); - if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) - return process_overflow_event(event, offset, head); - switch (event->header.type) { + case PERF_EVENT_SAMPLE: + return process_sample_event(event, offset, head); + case PERF_EVENT_MMAP: return process_mmap_event(event, offset, head); @@ -1388,18 +1390,21 @@ process_event(event_t *event, unsigned long offset, unsigned long head) static struct perf_header *header; -static int 
perf_header__has_sample(u64 sample_mask) +static u64 perf_header__sample_type(void) { + u64 sample_type = 0; int i; for (i = 0; i < header->attrs; i++) { struct perf_header_attr *attr = header->attr[i]; - if (!(attr->attr.sample_type & sample_mask)) - return 0; + if (!sample_type) + sample_type = attr->attr.sample_type; + else if (sample_type != attr->attr.sample_type) + die("non matching sample_type"); } - return 1; + return sample_type; } static int __cmd_report(void) @@ -1437,8 +1442,9 @@ static int __cmd_report(void) header = perf_header__read(input); head = header->data_offset; - if (sort__has_parent && - !perf_header__has_sample(PERF_SAMPLE_CALLCHAIN)) { + sample_type = perf_header__sample_type(); + + if (sort__has_parent && !(sample_type & PERF_SAMPLE_CALLCHAIN)) { fprintf(stderr, "selected --sort parent, but no callchain data\n"); exit(-1); } diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 5352b5e352ed..cf0d21f1ae10 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -392,11 +392,11 @@ static void record_ip(u64 ip, int counter) samples--; } -static void process_event(u64 ip, int counter) +static void process_event(u64 ip, int counter, int user) { samples++; - if (ip < min_ip || ip > max_ip) { + if (user) { userspace_samples++; return; } @@ -509,9 +509,10 @@ static void mmap_read_counter(struct mmap_data *md) old += size; - if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) { - if (event->header.type & PERF_SAMPLE_IP) - process_event(event->ip.ip, md->counter); + if (event->header.type == PERF_EVENT_SAMPLE) { + int user = + (event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK) == PERF_EVENT_MISC_USER; + process_event(event->ip.ip, md->counter, user); } } -- cgit v1.2.3 From 5211a242d0cbdded372aee59da18f80552b0a80a Mon Sep 17 00:00:00 2001 From: Kurt Garloff Date: Wed, 24 Jun 2009 14:32:11 -0700 Subject: x86: Add sysctl to allow panic on IOCK NMI error This patch introduces a new sysctl: /proc/sys/kernel/panic_on_io_nmi which defaults to 0 (off). When enabled, the kernel panics when the kernel receives an NMI caused by an IO error. The IO error triggered NMI indicates a serious system condition, which could result in IO data corruption. Rather than contiuing, panicing and dumping might be a better choice, so one can figure out what's causing the IO error. This could be especially important to companies running IO intensive applications where corruption must be avoided, e.g. a bank's databases. [ SuSE has been shipping it for a while, it was done at the request of a large database vendor, for their users. ] Signed-off-by: Kurt Garloff Signed-off-by: Roberto Angelino Signed-off-by: Greg Kroah-Hartman Cc: "Eric W. 
Biederman" LKML-Reference: <20090624213211.GA11291@kroah.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 1 + arch/x86/kernel/traps.c | 3 +++ include/linux/kernel.h | 1 + kernel/sysctl.c | 8 ++++++++ 4 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 95ea5fa7d444..c8405718a4c3 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -22,6 +22,7 @@ #include "dumpstack.h" int panic_on_unrecovered_nmi; +int panic_on_io_nmi; unsigned int code_bytes = 64; int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; static int die_counter; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a0f48f5671c0..5204332f475d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -346,6 +346,9 @@ io_check_error(unsigned char reason, struct pt_regs *regs) printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); show_registers(regs); + if (panic_on_io_nmi) + panic("NMI IOCK error: Not continuing"); + /* Re-enable the IOCK line, wait for a few seconds */ reason = (reason & 0xf) | 8; outb(reason, 0x61); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index fac104e7186a..d6320a3e8def 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -303,6 +303,7 @@ extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in extern int panic_timeout; extern int panic_on_oops; extern int panic_on_unrecovered_nmi; +extern int panic_on_io_nmi; extern const char *print_tainted(void); extern void add_taint(unsigned flag); extern int test_taint(unsigned flag); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 62e4ff9968b5..fba42eda8de2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -743,6 +743,14 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "panic_on_io_nmi", + .data = &panic_on_io_nmi, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = KERN_BOOTLOADER_TYPE, .procname = "bootloader_type", -- cgit v1.2.3 From 0296e4254f3318e0dcad9706fa1daf8e5addc1e9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 26 Jun 2009 11:15:37 +0800 Subject: ftrace: Fix the output of profile The first entry of the ftrace profile was always skipped when reading trace_stat/functionX. Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A443D59.4080307@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 71a52c172140..f3716bf04df6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -291,7 +291,9 @@ function_stat_next(void *v, int idx) pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK); again: - rec++; + if (idx != 0) + rec++; + if ((void *)rec >= (void *)&pg->records[pg->index]) { pg = pg->next; if (!pg) -- cgit v1.2.3 From 19d2e755436054dfc2be640bffc32e427c37ac3d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Jun 2009 13:10:23 +0200 Subject: perf_counter: Complete counter swap Complete the counter swap by indeed switching the times too and updating the userpage after modifying the counter values. 
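[Editor's illustration of the reader this userpage update serves, a minimal sketch rather than the actual API: self-monitoring userspace reads the mmap control page with a seqlock-style retry loop, so the swizzled count and the index/offset pair are never observed half-updated. The struct is a hypothetical subset of struct perf_counter_mmap_page, and the rdpmc() read of the hardware counter is assumed and left as a comment.]

#include <stdint.h>

/* illustrative subset of struct perf_counter_mmap_page */
struct mmap_control_page {
	volatile uint32_t lock;		/* seqlock-style sequence count */
	uint32_t index;			/* hw counter index + 1, 0 if inactive */
	int64_t offset;			/* add to the hw value, or use alone */
};

static uint64_t read_self_count(struct mmap_control_page *pg)
{
	uint32_t seq;
	uint64_t count;

	do {
		seq = pg->lock;
		__sync_synchronize();	/* pairs with the writer's barrier() */

		count = (uint64_t)pg->offset;
		/* if (pg->index) count += rdpmc(pg->index - 1); -- assumed helper */

		__sync_synchronize();
	} while (pg->lock != seq);

	return count;
}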
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1246014623.31755.195.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f2f232696587..66ab1e9d1294 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1048,9 +1048,14 @@ static void __perf_counter_sync_stat(struct perf_counter *counter, value = atomic64_xchg(&counter->count, value); atomic64_set(&next_counter->count, value); + swap(counter->total_time_enabled, next_counter->total_time_enabled); + swap(counter->total_time_running, next_counter->total_time_running); + /* - * XXX also sync time_enabled and time_running ? + * Since we swizzled the values, update the user visible data too. */ + perf_counter_update_userpage(counter); + perf_counter_update_userpage(next_counter); } #define list_next_entry(pos, member) \ -- cgit v1.2.3 From a32c7765e2796395aec49f699bd25c407155e9c5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 26 Jun 2009 16:55:51 +0800 Subject: tracing: Fix stack tracer sysctl handling This made my machine completely frozen: # echo 1 > /proc/sys/kernel/stack_tracer_enabled # echo 2 > /proc/sys/kernel/stack_tracer_enabled The cause is register_ftrace_function() was called twice. Also fix ftrace_enabled sysctl, though seems nothing bad happened as I tested it. Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A448D17.9010305@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 4 ++-- kernel/trace/trace_stack.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f3716bf04df6..bce9e01a29c8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3160,10 +3160,10 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, ret = proc_dointvec(table, write, file, buffer, lenp, ppos); - if (ret || !write || (last_ftrace_enabled == ftrace_enabled)) + if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) goto out; - last_ftrace_enabled = ftrace_enabled; + last_ftrace_enabled = !!ftrace_enabled; if (ftrace_enabled) { diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 2d7aebd71dbd..e644af910124 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -326,10 +326,10 @@ stack_trace_sysctl(struct ctl_table *table, int write, ret = proc_dointvec(table, write, file, buffer, lenp, ppos); if (ret || !write || - (last_stack_tracer_enabled == stack_tracer_enabled)) + (last_stack_tracer_enabled == !!stack_tracer_enabled)) goto out; - last_stack_tracer_enabled = stack_tracer_enabled; + last_stack_tracer_enabled = !!stack_tracer_enabled; if (stack_tracer_enabled) register_ftrace_function(&trace_ops); -- cgit v1.2.3 From 82d5308127c3e3404ffbf41e503853c68660b18b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 26 Jun 2009 17:07:02 +0800 Subject: trace_export: Repair missed fields Some fields for struct ftrace_graph_ret are missed when they are exported to user. 
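[Editor's illustration, not part of this series: with calltime/rettime now exported, a consumer of the funcgraph_exit event can compute per-call duration directly. The struct below merely mirrors the exported fields from the diff above; the struct name and the assumption that the trace clock reports nanoseconds are the editor's.]

/* mirrors the fields exported for funcgraph_exit; name is illustrative */
struct funcgraph_exit_fields {
	unsigned long func;
	unsigned long long calltime;
	unsigned long long rettime;
	unsigned long overrun;
	int depth;
};

/* per-call duration, in the (assumed nanosecond) units of the trace clock */
static unsigned long long funcgraph_duration(const struct funcgraph_exit_fields *f)
{
	return f->rettime - f->calltime;
}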
Signed-off-by: Lai Jiangshan Cc: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <4A448FB6.5000302@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index 5e32e375134d..6db005e12487 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -26,6 +26,9 @@ TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET, ftrace_graph_ret_entry, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned long, ret.func, func) + TRACE_FIELD(unsigned long long, ret.calltime, calltime) + TRACE_FIELD(unsigned long long, ret.rettime, rettime) + TRACE_FIELD(unsigned long, ret.overrun, overrun) TRACE_FIELD(int, ret.depth, depth) ), TP_RAW_FMT("<-- %lx (%d)") -- cgit v1.2.3 From 238a24f626628cb16a3015f332d649f08246ca89 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 29 Jun 2009 15:55:10 +0800 Subject: tracing/fastboot: Document the need of initcall_debug To use boot tracer, one should pass initcall_debug as well as ftrace=initcall to the command line. Signed-off-by: Li Zefan Cc: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <4A48735E.9050002@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1551f47e7669..019f380fd764 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -226,13 +226,13 @@ config BOOT_TRACER the timings of the initcalls and traces key events and the identity of tasks that can cause boot delays, such as context-switches. - Its aim is to be parsed by the /scripts/bootgraph.pl tool to + Its aim is to be parsed by the scripts/bootgraph.pl tool to produce pretty graphics about boot inefficiencies, giving a visual representation of the delays during initcalls - but the raw /debug/tracing/trace text output is readable too. - You must pass in ftrace=initcall to the kernel command line - to enable this on bootup. + You must pass in initcall_debug and ftrace=initcall to the kernel + command line to enable this on bootup. config TRACE_BRANCH_PROFILING bool -- cgit v1.2.3 From 12de38b186c2af97bf0b4a1f907f766df46b1def Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 29 Jun 2009 17:13:55 +0100 Subject: kmemleak: Inform kmemleak about pid_hash Kmemleak does not track alloc_bootmem calls but the pid_hash allocated in pidhash_init() would need to be scanned as it contains pointers to struct pid objects. Signed-off-by: Catalin Marinas --- kernel/pid.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 31310b5d3f50..5fa1db48d8b7 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -36,6 +36,7 @@ #include #include #include +#include #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) @@ -512,6 +513,12 @@ void __init pidhash_init(void) pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash))); if (!pid_hash) panic("Could not alloc pidhash!\n"); + /* + * pid_hash contains references to allocated struct pid objects and it + * must be scanned by kmemleak to avoid false positives. 
+ */ + kmemleak_alloc(pid_hash, pidhash_size * sizeof(*(pid_hash)), 0, + GFP_KERNEL); for (i = 0; i < pidhash_size; i++) INIT_HLIST_HEAD(&pid_hash[i]); } -- cgit v1.2.3 From 57e7986ed142417498155ebcd5eaf617ac37136d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 30 Jun 2009 16:07:19 +1000 Subject: perf_counter: Provide a way to enable counters on exec This provides a way to mark a counter to be enabled on the next exec. This is useful for measuring the total activity of a program without including overhead from the process that launches it. This also changes the perf stat command to use this new facility. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <19017.43927.838745.689203@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 3 ++- kernel/perf_counter.c | 50 ++++++++++++++++++++++++++++++++++++++++++++ tools/perf/builtin-stat.c | 6 +++--- 3 files changed, 55 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 3078e23c91eb..5e970c7d3fd5 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -179,8 +179,9 @@ struct perf_counter_attr { comm : 1, /* include comm data */ freq : 1, /* use freq, not period */ inherit_stat : 1, /* per task counts */ + enable_on_exec : 1, /* next exec enables */ - __reserved_1 : 52; + __reserved_1 : 51; __u32 wakeup_events; /* wakeup every n events */ __u32 __reserved_2; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 66ab1e9d1294..d55a50da2347 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1428,6 +1428,53 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) perf_counter_task_sched_in(curr, cpu); } +/* + * Enable all of a task's counters that have been marked enable-on-exec. + * This expects task == current. + */ +static void perf_counter_enable_on_exec(struct task_struct *task) +{ + struct perf_counter_context *ctx; + struct perf_counter *counter; + unsigned long flags; + int enabled = 0; + + local_irq_save(flags); + ctx = task->perf_counter_ctxp; + if (!ctx || !ctx->nr_counters) + goto out; + + __perf_counter_task_sched_out(ctx); + + spin_lock(&ctx->lock); + + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (!counter->attr.enable_on_exec) + continue; + counter->attr.enable_on_exec = 0; + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) + continue; + counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_enabled = + ctx->time - counter->total_time_enabled; + enabled = 1; + } + + /* + * Unclone this context if we enabled any counter. 
+ */ + if (enabled && ctx->parent_ctx) { + put_ctx(ctx->parent_ctx); + ctx->parent_ctx = NULL; + } + + spin_unlock(&ctx->lock); + + perf_counter_task_sched_in(task, smp_processor_id()); + out: + local_irq_restore(flags); +} + /* * Cross CPU call to read the hardware counter */ @@ -2949,6 +2996,9 @@ void perf_counter_comm(struct task_struct *task) { struct perf_comm_event comm_event; + if (task->perf_counter_ctxp) + perf_counter_enable_on_exec(task); + if (!atomic_read(&nr_comm_counters)) return; diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 201ef2367dcb..2e03524a1de0 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -116,8 +116,9 @@ static void create_perf_stat_counter(int counter, int pid) fd[cpu][counter], strerror(errno)); } } else { - attr->inherit = inherit; - attr->disabled = 1; + attr->inherit = inherit; + attr->disabled = 1; + attr->enable_on_exec = 1; fd[0][counter] = sys_perf_counter_open(attr, pid, -1, -1, 0); if (fd[0][counter] < 0 && verbose) @@ -262,7 +263,6 @@ static int run_perf_stat(int argc, const char **argv) * Enable counters and exec the command: */ t0 = rdclock(); - prctl(PR_TASK_PERF_COUNTERS_ENABLE); close(go_pipe[1]); wait(&status); -- cgit v1.2.3 From 8bc1ad7dd301b7ca7454013519fa92e8c53655ff Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 30 Jun 2009 11:41:31 -0700 Subject: kernel/resource.c: fix sign extension in reserve_setup() When the 32-bit signed quantities get assigned to the u64 resource_size_t, they are incorrectly sign-extended. Addresses http://bugzilla.kernel.org/show_bug.cgi?id=13253 Addresses http://bugzilla.kernel.org/show_bug.cgi?id=9905 Signed-off-by: Zhang Rui Reported-by: Leann Ogasawara Cc: Pierre Ossman Reported-by: Tested-by: Cc: Cc: Jesse Barnes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index ac5f3a36923f..78b087221c15 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -787,7 +787,7 @@ static int __init reserve_setup(char *str) static struct resource reserve[MAXRESERVE]; for (;;) { - int io_start, io_num; + unsigned int io_start, io_num; int x = reserved; if (get_option (&str, &io_start) != 2) -- cgit v1.2.3 From df279ca8966c3de83105428e3391ab17690802a9 Mon Sep 17 00:00:00 2001 From: Renaud Lottiaux Date: Tue, 30 Jun 2009 11:41:34 -0700 Subject: bsdacct: fix access to invalid filp in acct_on() The file opened in acct_on and freshly stored in the ns->bacct struct can be closed in acct_file_reopen by a concurrent call after we release acct_lock and before we call mntput(file->f_path.mnt). Record file->f_path.mnt in a local variable and use this variable only. 
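[Editor's illustration of the reserve_setup sign-extension fix above, a standalone userspace demo with hypothetical values rather than the kernel code itself: a value above INT_MAX stored in a signed int goes negative, and assigning it to a 64-bit resource_size_t then sign-extends it into a bogus huge address, while an unsigned int zero-extends correctly.]

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int s = (int)0xfe000000;	/* hypothetical reserve= address parsed into an int */
	unsigned int u = 0xfe000000;	/* same address parsed into an unsigned int */

	uint64_t bad  = s;	/* sign-extended:  0xfffffffffe000000 */
	uint64_t good = u;	/* zero-extended:  0x00000000fe000000 */

	printf("%llx\n%llx\n", (unsigned long long)bad,
	       (unsigned long long)good);
	return 0;
}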
Signed-off-by: Renaud Lottiaux Signed-off-by: Louis Rilling Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 7afa31564162..9f3391090b3e 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -215,6 +215,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, static int acct_on(char *name) { struct file *file; + struct vfsmount *mnt; int error; struct pid_namespace *ns; struct bsd_acct_struct *acct = NULL; @@ -256,11 +257,12 @@ static int acct_on(char *name) acct = NULL; } - mnt_pin(file->f_path.mnt); + mnt = file->f_path.mnt; + mnt_pin(mnt); acct_file_reopen(ns->bacct, file, ns); spin_unlock(&acct_lock); - mntput(file->f_path.mnt); /* it's pinned, now give up active reference */ + mntput(mnt); /* it's pinned, now give up active reference */ kfree(acct); return 0; -- cgit v1.2.3 From 4a2bb6fcc80e6330ca2f2393e98605052cc7780b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 30 Jun 2009 17:08:09 -0400 Subject: kprobes: No need to unlock kprobe_insn_mutex Remove needless kprobe_insn_mutex unlocking during safety check in garbage collection, because if someone releases a dirty slot during safety check (which ensures other cpus doesn't execute all dirty slots), the safety check must be fail. So, we need to hold the mutex while checking safety. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli LKML-Reference: <20090630210809.17851.28781.stgit@localhost.localdomain> Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index c0fa54b276d9..16b5739c516a 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -237,13 +237,9 @@ static int __kprobes collect_garbage_slots(void) { struct kprobe_insn_page *kip; struct hlist_node *pos, *next; - int safety; /* Ensure no-one is preepmted on the garbages */ - mutex_unlock(&kprobe_insn_mutex); - safety = check_safety(); - mutex_lock(&kprobe_insn_mutex); - if (safety != 0) + if (check_safety()) return -EAGAIN; hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { -- cgit v1.2.3 From e1af3aec3e2e7d21d4b3054323779d478c19d907 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 1 Jul 2009 16:50:25 +0800 Subject: tracing: Fix trace_print_seq() We will lose something if trace_seq->buffer[0] is 0, because the copy length is calculated by strlen() in seq_puts(), so using seq_write() instead of seq_puts(). There have a example: after reboot: # echo kmemtrace > current_tracer # echo 0 > options/kmem_minimalistic # cat trace # tracer: kmemtrace # # Nothing is exported, because the first byte of trace_seq->buffer[ ] is KMEMTRACE_USER_ALLOC. 
( the value of KMEMTRACE_USER_ALLOC is zero, seeing kmemtrace_print_alloc_user() in kernel/trace/kmemtrace.c) Signed-off-by: Xiao Guangrong Acked-by: Frederic Weisbecker Acked-by: Pekka Enberg Acked-by: Eduard - Gabriel Munteanu Cc: Steven Rostedt LKML-Reference: <4A4B2351.5010300@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_output.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 7938f3ae93e3..e0c2545622e8 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -27,8 +27,7 @@ void trace_print_seq(struct seq_file *m, struct trace_seq *s) { int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; - s->buffer[len] = 0; - seq_puts(m, s->buffer); + seq_write(m, s->buffer, len); trace_seq_init(s); } -- cgit v1.2.3 From 5bfd7560979062ad75c9805c1719cec990b5db29 Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Sun, 5 Jul 2009 12:08:19 -0700 Subject: Fix virt_to_phys() warnings These warnings were observed on MIPS32 using 2.6.31-rc1 and gcc-4.2.0: mm/page_alloc.c: In function 'alloc_pages_exact': mm/page_alloc.c:1986: warning: passing argument 1 of 'virt_to_phys' makes pointer from integer without a cast drivers/usb/mon/mon_bin.c: In function 'mon_alloc_buff': drivers/usb/mon/mon_bin.c:1264: warning: passing argument 1 of 'virt_to_phys' makes pointer from integer without a cast [akpm@linux-foundation.org: fix kernel/perf_counter.c too] Signed-off-by: Kevin Cernekee Cc: Andi Kleen Cc: Ralf Baechle Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/usb/mon/mon_bin.c | 2 +- kernel/perf_counter.c | 2 +- mm/page_alloc.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/drivers/usb/mon/mon_bin.c b/drivers/usb/mon/mon_bin.c index f8d9045d668a..0f7a30b7d2d1 100644 --- a/drivers/usb/mon/mon_bin.c +++ b/drivers/usb/mon/mon_bin.c @@ -1261,7 +1261,7 @@ static int mon_alloc_buff(struct mon_pgmap *map, int npages) return -ENOMEM; } map[n].ptr = (unsigned char *) vaddr; - map[n].pg = virt_to_page(vaddr); + map[n].pg = virt_to_page((void *) vaddr); } return 0; } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d55a50da2347..a641eb753b8c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2020,7 +2020,7 @@ fail: static void perf_mmap_free_page(unsigned long addr) { - struct page *page = virt_to_page(addr); + struct page *page = virt_to_page((void *)addr); page->mapping = NULL; __free_page(page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e0f2cdf9d8b1..ad7cd1c56b07 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1983,7 +1983,7 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) unsigned long alloc_end = addr + (PAGE_SIZE << order); unsigned long used = addr + PAGE_ALIGN(size); - split_page(virt_to_page(addr), order); + split_page(virt_to_page((void *)addr), order); while (used < alloc_end) { free_page(used); used += PAGE_SIZE; -- cgit v1.2.3 From 793285fcafce4719a05e0c99fa74b188157fe7fe Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 5 Jul 2009 12:08:26 -0700 Subject: cred_guard_mutex: do not return -EINTR to user-space do_execve() and ptrace_attach() return -EINTR if mutex_lock_interruptible(->cred_guard_mutex) fails. This is not right, change the code to return ERESTARTNOINTR. Perhaps we should also change proc_pid_attr_write(). 
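[Editor's illustration of the userspace-visible difference, an assumed scenario rather than anything in this series: with the old -EINTR return, a defensive caller would have needed an EINTR retry loop around execve(); with -ERESTARTNOINTR the kernel restarts the call itself and the loop becomes unnecessary.]

#include <errno.h>
#include <unistd.h>

/* defensive wrapper a caller might have used against spurious EINTR */
static int execve_eintr_retry(const char *path, char *const argv[],
			      char *const envp[])
{
	int ret;

	do {
		ret = execve(path, argv, envp);
	} while (ret == -1 && errno == EINTR);

	return ret;	/* only reached if execve() itself failed */
}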
Signed-off-by: Oleg Nesterov Cc: David Howells Acked-by: Roland McGrath Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat.c | 4 ++-- fs/exec.c | 4 ++-- kernel/ptrace.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/fs/compat.c b/fs/compat.c index cdd51a3a7c53..fbadb947727b 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1486,8 +1486,8 @@ int compat_do_execve(char * filename, if (!bprm) goto out_files; - retval = mutex_lock_interruptible(¤t->cred_guard_mutex); - if (retval < 0) + retval = -ERESTARTNOINTR; + if (mutex_lock_interruptible(¤t->cred_guard_mutex)) goto out_free; current->in_execve = 1; diff --git a/fs/exec.c b/fs/exec.c index e639957d7a57..4a8849e45b21 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1277,8 +1277,8 @@ int do_execve(char * filename, if (!bprm) goto out_files; - retval = mutex_lock_interruptible(¤t->cred_guard_mutex); - if (retval < 0) + retval = -ERESTARTNOINTR; + if (mutex_lock_interruptible(¤t->cred_guard_mutex)) goto out_free; current->in_execve = 1; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 61c78b2c07ba..082c320e4dbf 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -181,8 +181,8 @@ int ptrace_attach(struct task_struct *task) * interference; SUID, SGID and LSM creds get determined differently * under ptrace. */ - retval = mutex_lock_interruptible(&task->cred_guard_mutex); - if (retval < 0) + retval = -ERESTARTNOINTR; + if (mutex_lock_interruptible(&task->cred_guard_mutex)) goto out; task_lock(task); -- cgit v1.2.3 From b43f3cbd21ffbd719fd4fa6642bfe6af255ded34 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Jul 2009 01:54:37 +0400 Subject: headers: mnt_namespace.h redux Fix various silly problems wrt mnt_namespace.h: - exit_mnt_ns() isn't used, remove it - done that, sched.h and nsproxy.h inclusions aren't needed - mount.h inclusion was need for vfsmount_lock, but no longer - remove mnt_namespace.h inclusion from files which don't use anything from mnt_namespace.h Signed-off-by: Alexey Dobriyan Signed-off-by: Linus Torvalds --- fs/afs/mntpt.c | 1 - fs/namespace.c | 1 + fs/nfs/getroot.c | 1 - fs/reiserfs/super.c | 1 - include/linux/mnt_namespace.h | 13 ++----------- kernel/exit.c | 1 - kernel/fork.c | 1 - kernel/kmod.c | 1 - 8 files changed, 3 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index c52be53f6946..5ffb570cd3a8 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -17,7 +17,6 @@ #include #include #include -#include #include "internal.h" diff --git a/fs/namespace.c b/fs/namespace.c index 3dc283fd4716..277c28a63ead 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 46177cb87064..b35d2a616066 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index d3aeb061612b..7adea74d6a8a 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 3beb2592b03f..d74785c2393a 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -2,10 +2,9 @@ #define _NAMESPACE_H_ #ifdef __KERNEL__ -#include -#include -#include +#include #include +#include struct mnt_namespace { atomic_t 
count; @@ -28,14 +27,6 @@ extern struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt); extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, struct fs_struct *); extern void put_mnt_ns(struct mnt_namespace *ns); - -static inline void exit_mnt_ns(struct task_struct *p) -{ - struct mnt_namespace *ns = p->nsproxy->mnt_ns; - if (ns) - put_mnt_ns(ns); -} - static inline void get_mnt_ns(struct mnt_namespace *ns) { atomic_inc(&ns->count); diff --git a/kernel/exit.c b/kernel/exit.c index 628d41f0dd54..869dc221733e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 467746b3f0aa..bd2959228871 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/kmod.c b/kernel/kmod.c index 7e95bedb2bfc..385c31a1bdbf 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From ad361c9884e809340f6daca80d56a9e9c871690a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 6 Jul 2009 13:05:40 -0700 Subject: Remove multiple KERN_ prefixes from printk formats Commit 5fd29d6ccbc98884569d6f3105aeca70858b3e0f ("printk: clean up handling of log-levels and newlines") changed printk semantics. printk lines with multiple KERN_ prefixes are no longer emitted as before the patch. is now included in the output on each additional use. Remove all uses of multiple KERN_s in formats. Signed-off-by: Joe Perches Signed-off-by: Linus Torvalds --- arch/avr32/kernel/traps.c | 13 ++++---- arch/blackfin/kernel/setup.c | 41 ++++++++++++------------- arch/blackfin/kernel/traps.c | 34 ++++++++++----------- arch/m68knommu/kernel/process.c | 21 +++++++------ arch/m68knommu/kernel/traps.c | 6 ++-- arch/mn10300/kernel/traps.c | 21 +++++-------- arch/parisc/kernel/process.c | 2 +- arch/parisc/kernel/traps.c | 20 ++++++------- arch/um/kernel/sysrq.c | 4 +-- arch/x86/kernel/apic/io_apic.c | 1 - arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 12 ++++---- arch/x86/kernel/e820.c | 7 ++--- arch/x86/kernel/pci-gart_64.c | 2 +- arch/x86/mm/fault.c | 9 +++--- arch/xtensa/kernel/traps.c | 6 ++-- drivers/block/amiflop.c | 2 +- drivers/block/xsysace.c | 7 +++-- drivers/char/hw_random/intel-rng.c | 9 +++--- drivers/char/isicom.c | 16 +++++----- drivers/i2c/busses/i2c-ibm_iic.c | 9 +++--- drivers/md/md.c | 18 ++++++----- drivers/misc/sgi-xp/xpnet.c | 4 +-- drivers/net/a2065.c | 12 ++------ drivers/net/arcnet/arcnet.c | 26 +++++++--------- drivers/net/bmac.c | 7 +++-- drivers/net/bnx2x_main.c | 7 +++-- drivers/net/dl2k.c | 17 ++++++----- drivers/net/epic100.c | 5 ++-- drivers/net/fealnx.c | 15 ++++++---- drivers/net/hamachi.c | 23 +++++++------- drivers/net/hamradio/baycom_epp.c | 2 +- drivers/net/hamradio/baycom_par.c | 2 +- drivers/net/hamradio/baycom_ser_fdx.c | 2 +- drivers/net/hamradio/baycom_ser_hdx.c | 2 +- drivers/net/natsemi.c | 4 +-- drivers/net/ne.c | 2 +- drivers/net/pci-skeleton.c | 2 +- drivers/net/pcmcia/ibmtr_cs.c | 13 ++++---- drivers/net/pcmcia/nmclan_cs.c | 15 ++++------ drivers/net/pcnet32.c | 50 +++++++++++++++---------------- drivers/net/starfire.c | 2 +- drivers/net/sundance.c | 6 ++-- drivers/net/tsi108_eth.c | 8 +++-- drivers/net/tulip/de2104x.c | 7 +++-- drivers/net/tulip/tulip_core.c | 14 +++++---- drivers/net/tulip/winbond-840.c | 6 ++-- drivers/net/wan/hd64570.c | 3 
+- drivers/net/wan/hd64572.c | 3 +- drivers/net/wan/sbni.c | 8 ++--- drivers/net/wireless/ray_cs.c | 9 +++--- drivers/net/wireless/wavelan_cs.c | 13 ++------ drivers/net/yellowfin.c | 23 +++++++------- drivers/parisc/eisa_enumerator.c | 14 ++++----- drivers/pcmcia/tcic.c | 3 +- drivers/scsi/atari_NCR5380.c | 3 +- drivers/scsi/mac53c94.c | 5 ++-- drivers/scsi/sg.c | 2 +- drivers/scsi/sun3_NCR5380.c | 3 +- drivers/serial/8250_pci.c | 11 +++---- drivers/ssb/pcmcia.c | 4 +-- drivers/usb/core/hcd.c | 2 +- drivers/video/amba-clcd.c | 7 +++-- drivers/video/matrox/matroxfb_DAC1064.c | 4 +-- drivers/video/matrox/matroxfb_Ti3026.c | 4 +-- drivers/video/stifb.c | 7 ++--- fs/jffs2/erase.c | 10 ++++--- kernel/module.c | 6 ++-- sound/pci/emu10k1/p16v.c | 2 +- sound/usb/usx2y/usbusx2yaudio.c | 7 +++-- 70 files changed, 330 insertions(+), 338 deletions(-) (limited to 'kernel') diff --git a/arch/avr32/kernel/traps.c b/arch/avr32/kernel/traps.c index 6e3d491184ea..b91b2044af9c 100644 --- a/arch/avr32/kernel/traps.c +++ b/arch/avr32/kernel/traps.c @@ -32,22 +32,25 @@ void NORET_TYPE die(const char *str, struct pt_regs *regs, long err) spin_lock_irq(&die_lock); bust_spinlocks(1); - printk(KERN_ALERT "Oops: %s, sig: %ld [#%d]\n" KERN_EMERG, + printk(KERN_ALERT "Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); + + printk(KERN_EMERG); + #ifdef CONFIG_PREEMPT - printk("PREEMPT "); + printk(KERN_CONT "PREEMPT "); #endif #ifdef CONFIG_FRAME_POINTER - printk("FRAME_POINTER "); + printk(KERN_CONT "FRAME_POINTER "); #endif if (current_cpu_data.features & AVR32_FEATURE_OCD) { unsigned long did = ocd_read(DID); - printk("chip: 0x%03lx:0x%04lx rev %lu\n", + printk(KERN_CONT "chip: 0x%03lx:0x%04lx rev %lu\n", (did >> 1) & 0x7ff, (did >> 12) & 0x7fff, (did >> 28) & 0xf); } else { - printk("cpu: arch %u r%u / core %u r%u\n", + printk(KERN_CONT "cpu: arch %u r%u / core %u r%u\n", current_cpu_data.arch_type, current_cpu_data.arch_revision, current_cpu_data.cpu_type, diff --git a/arch/blackfin/kernel/setup.c b/arch/blackfin/kernel/setup.c index 298f023bcc09..6136c33e919f 100644 --- a/arch/blackfin/kernel/setup.c +++ b/arch/blackfin/kernel/setup.c @@ -408,13 +408,14 @@ static void __init print_memory_map(char *who) bfin_memmap.map[i].addr + bfin_memmap.map[i].size); switch (bfin_memmap.map[i].type) { case BFIN_MEMMAP_RAM: - printk("(usable)\n"); - break; + printk(KERN_CONT "(usable)\n"); + break; case BFIN_MEMMAP_RESERVED: - printk("(reserved)\n"); - break; - default: printk("type %lu\n", bfin_memmap.map[i].type); - break; + printk(KERN_CONT "(reserved)\n"); + break; + default: + printk(KERN_CONT "type %lu\n", bfin_memmap.map[i].type); + break; } } } @@ -614,19 +615,19 @@ static __init void memory_setup(void) printk(KERN_INFO "Kernel Managed Memory: %ldMB\n", _ramend >> 20); printk(KERN_INFO "Memory map:\n" - KERN_INFO " fixedcode = 0x%p-0x%p\n" - KERN_INFO " text = 0x%p-0x%p\n" - KERN_INFO " rodata = 0x%p-0x%p\n" - KERN_INFO " bss = 0x%p-0x%p\n" - KERN_INFO " data = 0x%p-0x%p\n" - KERN_INFO " stack = 0x%p-0x%p\n" - KERN_INFO " init = 0x%p-0x%p\n" - KERN_INFO " available = 0x%p-0x%p\n" + " fixedcode = 0x%p-0x%p\n" + " text = 0x%p-0x%p\n" + " rodata = 0x%p-0x%p\n" + " bss = 0x%p-0x%p\n" + " data = 0x%p-0x%p\n" + " stack = 0x%p-0x%p\n" + " init = 0x%p-0x%p\n" + " available = 0x%p-0x%p\n" #ifdef CONFIG_MTD_UCLINUX - KERN_INFO " rootfs = 0x%p-0x%p\n" + " rootfs = 0x%p-0x%p\n" #endif #if DMA_UNCACHED_REGION > 0 - KERN_INFO " DMA Zone = 0x%p-0x%p\n" + " DMA Zone = 0x%p-0x%p\n" #endif , (void *)FIXED_CODE_START, (void 
*)FIXED_CODE_END, _stext, _etext, @@ -859,13 +860,13 @@ void __init setup_arch(char **cmdline_p) #endif printk(KERN_INFO "Hardware Trace "); if (bfin_read_TBUFCTL() & 0x1) - printk("Active "); + printk(KERN_CONT "Active "); else - printk("Off "); + printk(KERN_CONT "Off "); if (bfin_read_TBUFCTL() & 0x2) - printk("and Enabled\n"); + printk(KERN_CONT "and Enabled\n"); else - printk("and Disabled\n"); + printk(KERN_CONT "and Disabled\n"); #if defined(CONFIG_CHR_DEV_FLASH) || defined(CONFIG_BLK_DEV_FLASH) /* we need to initialize the Flashrom device here since we might diff --git a/arch/blackfin/kernel/traps.c b/arch/blackfin/kernel/traps.c index 8eeb457ce5d5..8a1caf2bb5b9 100644 --- a/arch/blackfin/kernel/traps.c +++ b/arch/blackfin/kernel/traps.c @@ -212,7 +212,7 @@ asmlinkage void double_fault_c(struct pt_regs *fp) console_verbose(); oops_in_progress = 1; #ifdef CONFIG_DEBUG_VERBOSE - printk(KERN_EMERG "\n" KERN_EMERG "Double Fault\n"); + printk(KERN_EMERG "Double Fault\n"); #ifdef CONFIG_DEBUG_DOUBLEFAULT_PRINT if (((long)fp->seqstat & SEQSTAT_EXCAUSE) == VEC_UNCOV) { unsigned int cpu = smp_processor_id(); @@ -583,15 +583,14 @@ asmlinkage void trap_c(struct pt_regs *fp) #ifndef CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE if (trapnr == VEC_CPLB_I_M || trapnr == VEC_CPLB_M) verbose_printk(KERN_NOTICE "No trace since you do not have " - "CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE enabled\n" - KERN_NOTICE "\n"); + "CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE enabled\n\n"); else #endif dump_bfin_trace_buffer(); if (oops_in_progress) { /* Dump the current kernel stack */ - verbose_printk(KERN_NOTICE "\n" KERN_NOTICE "Kernel Stack\n"); + verbose_printk(KERN_NOTICE "Kernel Stack\n"); show_stack(current, NULL); print_modules(); #ifndef CONFIG_ACCESS_CHECK @@ -906,7 +905,7 @@ void show_stack(struct task_struct *task, unsigned long *stack) ret_addr = 0; if (!j && i % 8 == 0) - printk("\n" KERN_NOTICE "%p:",addr); + printk(KERN_NOTICE "%p:",addr); /* if it is an odd address, or zero, just skip it */ if (*addr & 0x1 || !*addr) @@ -996,9 +995,9 @@ void dump_bfin_process(struct pt_regs *fp) printk(KERN_NOTICE "CPU = %d\n", current_thread_info()->cpu); if (!((unsigned long)current->mm & 0x3) && (unsigned long)current->mm >= FIXED_CODE_START) - verbose_printk(KERN_NOTICE "TEXT = 0x%p-0x%p DATA = 0x%p-0x%p\n" - KERN_NOTICE " BSS = 0x%p-0x%p USER-STACK = 0x%p\n" - KERN_NOTICE "\n", + verbose_printk(KERN_NOTICE + "TEXT = 0x%p-0x%p DATA = 0x%p-0x%p\n" + " BSS = 0x%p-0x%p USER-STACK = 0x%p\n\n", (void *)current->mm->start_code, (void *)current->mm->end_code, (void *)current->mm->start_data, @@ -1009,8 +1008,8 @@ void dump_bfin_process(struct pt_regs *fp) else verbose_printk(KERN_NOTICE "invalid mm\n"); } else - verbose_printk(KERN_NOTICE "\n" KERN_NOTICE - "No Valid process in current context\n"); + verbose_printk(KERN_NOTICE + "No Valid process in current context\n"); #endif } @@ -1028,7 +1027,7 @@ void dump_bfin_mem(struct pt_regs *fp) addr < (unsigned short *)((unsigned long)erraddr & ~0xF) + 0x10; addr++) { if (!((unsigned long)addr & 0xF)) - verbose_printk("\n" KERN_NOTICE "0x%p: ", addr); + verbose_printk(KERN_NOTICE "0x%p: ", addr); if (!get_instruction(&val, addr)) { val = 0; @@ -1056,9 +1055,9 @@ void dump_bfin_mem(struct pt_regs *fp) oops_in_progress)){ verbose_printk(KERN_NOTICE "Looks like this was a deferred error - sorry\n"); #ifndef CONFIG_DEBUG_HWERR - verbose_printk(KERN_NOTICE "The remaining message may be meaningless\n" - KERN_NOTICE "You should enable CONFIG_DEBUG_HWERR to get a" - " better idea where it came from\n"); 
+ verbose_printk(KERN_NOTICE +"The remaining message may be meaningless\n" +"You should enable CONFIG_DEBUG_HWERR to get a better idea where it came from\n"); #else /* If we are handling only one peripheral interrupt * and current mm and pid are valid, and the last error @@ -1114,9 +1113,10 @@ void show_regs(struct pt_regs *fp) verbose_printk(KERN_NOTICE "%s", linux_banner); - verbose_printk(KERN_NOTICE "\n" KERN_NOTICE "SEQUENCER STATUS:\t\t%s\n", print_tainted()); + verbose_printk(KERN_NOTICE "\nSEQUENCER STATUS:\t\t%s\n", + print_tainted()); verbose_printk(KERN_NOTICE " SEQSTAT: %08lx IPEND: %04lx SYSCFG: %04lx\n", - (long)fp->seqstat, fp->ipend, fp->syscfg); + (long)fp->seqstat, fp->ipend, fp->syscfg); if ((fp->seqstat & SEQSTAT_EXCAUSE) == VEC_HWERR) { verbose_printk(KERN_NOTICE " HWERRCAUSE: 0x%lx\n", (fp->seqstat & SEQSTAT_HWERRCAUSE) >> 14); @@ -1184,7 +1184,7 @@ unlock: verbose_printk(KERN_NOTICE "ICPLB_FAULT_ADDR: %s\n", buf); } - verbose_printk(KERN_NOTICE "\n" KERN_NOTICE "PROCESSOR STATE:\n"); + verbose_printk(KERN_NOTICE "PROCESSOR STATE:\n"); verbose_printk(KERN_NOTICE " R0 : %08lx R1 : %08lx R2 : %08lx R3 : %08lx\n", fp->r0, fp->r1, fp->r2, fp->r3); verbose_printk(KERN_NOTICE " R4 : %08lx R5 : %08lx R6 : %08lx R7 : %08lx\n", diff --git a/arch/m68knommu/kernel/process.c b/arch/m68knommu/kernel/process.c index 1e96c6eb6312..8f8f4abab2ff 100644 --- a/arch/m68knommu/kernel/process.c +++ b/arch/m68knommu/kernel/process.c @@ -290,7 +290,7 @@ void dump(struct pt_regs *fp) unsigned char *tp; int i; - printk(KERN_EMERG "\n" KERN_EMERG "CURRENT PROCESS:\n" KERN_EMERG "\n"); + printk(KERN_EMERG "\nCURRENT PROCESS:\n\n"); printk(KERN_EMERG "COMM=%s PID=%d\n", current->comm, current->pid); if (current->mm) { @@ -301,8 +301,7 @@ void dump(struct pt_regs *fp) (int) current->mm->end_data, (int) current->mm->end_data, (int) current->mm->brk); - printk(KERN_EMERG "USER-STACK=%08x KERNEL-STACK=%08x\n" - KERN_EMERG "\n", + printk(KERN_EMERG "USER-STACK=%08x KERNEL-STACK=%08x\n\n", (int) current->mm->start_stack, (int)(((unsigned long) current) + THREAD_SIZE)); } @@ -313,35 +312,35 @@ void dump(struct pt_regs *fp) fp->d0, fp->d1, fp->d2, fp->d3); printk(KERN_EMERG "d4: %08lx d5: %08lx a0: %08lx a1: %08lx\n", fp->d4, fp->d5, fp->a0, fp->a1); - printk(KERN_EMERG "\n" KERN_EMERG "USP: %08x TRAPFRAME: %08x\n", + printk(KERN_EMERG "\nUSP: %08x TRAPFRAME: %08x\n", (unsigned int) rdusp(), (unsigned int) fp); - printk(KERN_EMERG "\n" KERN_EMERG "CODE:"); + printk(KERN_EMERG "\nCODE:"); tp = ((unsigned char *) fp->pc) - 0x20; for (sp = (unsigned long *) tp, i = 0; (i < 0x40); i += 4) { if ((i % 0x10) == 0) - printk("\n" KERN_EMERG "%08x: ", (int) (tp + i)); + printk(KERN_EMERG "%08x: ", (int) (tp + i)); printk("%08x ", (int) *sp++); } - printk("\n" KERN_EMERG "\n"); + printk(KERN_EMERG "\n"); printk(KERN_EMERG "KERNEL STACK:"); tp = ((unsigned char *) fp) - 0x40; for (sp = (unsigned long *) tp, i = 0; (i < 0xc0); i += 4) { if ((i % 0x10) == 0) - printk("\n" KERN_EMERG "%08x: ", (int) (tp + i)); + printk(KERN_EMERG "%08x: ", (int) (tp + i)); printk("%08x ", (int) *sp++); } - printk("\n" KERN_EMERG "\n"); + printk(KERN_EMERG "\n"); printk(KERN_EMERG "USER STACK:"); tp = (unsigned char *) (rdusp() - 0x10); for (sp = (unsigned long *) tp, i = 0; (i < 0x80); i += 4) { if ((i % 0x10) == 0) - printk("\n" KERN_EMERG "%08x: ", (int) (tp + i)); + printk(KERN_EMERG "%08x: ", (int) (tp + i)); printk("%08x ", (int) *sp++); } - printk("\n" KERN_EMERG "\n"); + printk(KERN_EMERG "\n"); } /* diff --git 
a/arch/m68knommu/kernel/traps.c b/arch/m68knommu/kernel/traps.c index 51d325343ab5..3739c8f657d7 100644 --- a/arch/m68knommu/kernel/traps.c +++ b/arch/m68knommu/kernel/traps.c @@ -111,7 +111,7 @@ static void print_this_address(unsigned long addr, int i) if (i % 5) printk(KERN_CONT " [%08lx] ", addr); else - printk(KERN_CONT "\n" KERN_EMERG " [%08lx] ", addr); + printk(KERN_EMERG " [%08lx] ", addr); i++; #endif } @@ -137,8 +137,8 @@ static void __show_stack(struct task_struct *task, unsigned long *stack) if (stack + 1 + i > endstack) break; if (i % 8 == 0) - printk("\n" KERN_EMERG " "); - printk(" %08lx", *(stack + i)); + printk(KERN_EMERG " "); + printk(KERN_CONT " %08lx", *(stack + i)); } printk("\n"); i = 0; diff --git a/arch/mn10300/kernel/traps.c b/arch/mn10300/kernel/traps.c index 681ad8c9e4fb..0dfdc5001124 100644 --- a/arch/mn10300/kernel/traps.c +++ b/arch/mn10300/kernel/traps.c @@ -136,8 +136,7 @@ void show_trace(unsigned long *sp) unsigned long *stack, addr, module_start, module_end; int i; - printk(KERN_EMERG "\n" - KERN_EMERG "Call Trace:"); + printk(KERN_EMERG "\nCall Trace:"); stack = sp; i = 0; @@ -153,7 +152,7 @@ void show_trace(unsigned long *sp) printk("\n"); #else if ((i % 6) == 0) - printk("\n" KERN_EMERG " "); + printk(KERN_EMERG " "); printk("[<%08lx>] ", addr); i++; #endif @@ -180,7 +179,7 @@ void show_stack(struct task_struct *task, unsigned long *sp) if (((long) stack & (THREAD_SIZE - 1)) == 0) break; if ((i % 8) == 0) - printk("\n" KERN_EMERG " "); + printk(KERN_EMERG " "); printk("%08lx ", *stack++); } @@ -264,8 +263,7 @@ void show_registers(struct pt_regs *regs) show_stack(current, (unsigned long *) sp); #if 0 - printk(KERN_EMERG "\n" - KERN_EMERG "Code: "); + printk(KERN_EMERG "\nCode: "); if (regs->pc < PAGE_OFFSET) goto bad; @@ -311,16 +309,14 @@ void die(const char *str, struct pt_regs *regs, enum exception_code code) { console_verbose(); spin_lock_irq(&die_lock); - printk(KERN_EMERG "\n" - KERN_EMERG "%s: %04x\n", + printk(KERN_EMERG "\n%s: %04x\n", str, code & 0xffff); show_registers(regs); if (regs->pc >= 0x02000000 && regs->pc < 0x04000000 && (regs->epsw & (EPSW_IM | EPSW_IE)) != (EPSW_IM | EPSW_IE)) { printk(KERN_EMERG "Exception in usermode interrupt handler\n"); - printk(KERN_EMERG "\n" - KERN_EMERG " Please connect to kernel debugger !!\n"); + printk(KERN_EMERG "\nPlease connect to kernel debugger !!\n"); asm volatile ("0: bra 0b"); } @@ -429,9 +425,8 @@ asmlinkage void io_bus_error(u32 bcberr, u32 bcbear, struct pt_regs *regs) { console_verbose(); - printk(KERN_EMERG "\n" - KERN_EMERG "Asynchronous I/O Bus Error\n" - KERN_EMERG "==========================\n"); + printk(KERN_EMERG "Asynchronous I/O Bus Error\n"); + printk(KERN_EMERG "==========================\n"); if (bcberr & BCBERR_BEME) printk(KERN_EMERG "- Multiple recorded errors\n"); diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 61c07078c072..1f3aa8db0203 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -156,7 +156,7 @@ void machine_power_off(void) * software. The user has to press the button himself. 
*/ printk(KERN_EMERG "System shut down completed.\n" - KERN_EMERG "Please power this system off now."); + "Please power this system off now."); } void (*pm_power_off)(void) = machine_power_off; diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index c32f5d6d778e..528f0ff9b273 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -250,15 +250,14 @@ void die_if_kernel(char *str, struct pt_regs *regs, long err) oops_enter(); /* Amuse the user in a SPARC fashion */ - if (err) printk( -KERN_CRIT " _______________________________ \n" -KERN_CRIT " < Your System ate a SPARC! Gah! >\n" -KERN_CRIT " ------------------------------- \n" -KERN_CRIT " \\ ^__^\n" -KERN_CRIT " \\ (xx)\\_______\n" -KERN_CRIT " (__)\\ )\\/\\\n" -KERN_CRIT " U ||----w |\n" -KERN_CRIT " || ||\n"); + if (err) printk(KERN_CRIT + " _______________________________ \n" + " < Your System ate a SPARC! Gah! >\n" + " ------------------------------- \n" + " \\ ^__^\n" + " (__)\\ )\\/\\\n" + " U ||----w |\n" + " || ||\n"); /* unlock the pdc lock if necessary */ pdc_emergency_unlock(); @@ -797,7 +796,8 @@ void notrace handle_interruption(int code, struct pt_regs *regs) else printk(KERN_DEBUG "User Fault (long pointer) (fault %d) ", code); - printk("pid=%d command='%s'\n", task_pid_nr(current), current->comm); + printk(KERN_CONT "pid=%d command='%s'\n", + task_pid_nr(current), current->comm); show_regs(regs); #endif si.si_signo = SIGSEGV; diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c index 56d43d0a3960..0960de54495a 100644 --- a/arch/um/kernel/sysrq.c +++ b/arch/um/kernel/sysrq.c @@ -70,8 +70,8 @@ void show_stack(struct task_struct *task, unsigned long *esp) if (kstack_end(stack)) break; if (i && ((i % 8) == 0)) - printk("\n" KERN_INFO " "); - printk("%08lx ", *stack++); + printk(KERN_INFO " "); + printk(KERN_CONT "%08lx ", *stack++); } show_trace(task, esp); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 8fd1efb5a0bd..90b5e6efa938 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1739,7 +1739,6 @@ __apicdebuginit(void) print_local_APIC(void *dummy) if (apic_verbosity == APIC_QUIET) return; - printk(KERN_DEBUG "\n"); printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", smp_processor_id(), hard_smp_processor_id()); v = apic_read(APIC_ID); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index ae068f59603f..2a50ef891000 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1259,7 +1259,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) { static const char ACPI_PSS_BIOS_BUG_MSG[] = KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" - KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n"; + FW_BUG PFX "Try again with latest BIOS.\n"; struct powernow_k8_data *data; struct init_on_cpu init_on_cpu; int rc; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index af425b83202b..484c1e5f658e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -194,14 +194,14 @@ static void print_mce(struct mce *m) m->cs, m->ip); if (m->cs == __KERNEL_CS) print_symbol("{%s}", m->ip); - printk("\n"); + printk(KERN_CONT "\n"); } printk(KERN_EMERG "TSC %llx ", m->tsc); if (m->addr) - printk("ADDR %llx ", m->addr); + printk(KERN_CONT "ADDR %llx ", m->addr); if (m->misc) - printk("MISC %llx ", m->misc); - printk("\n"); + 
printk(KERN_CONT "MISC %llx ", m->misc); + printk(KERN_CONT "\n"); printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); @@ -209,13 +209,13 @@ static void print_mce(struct mce *m) static void print_mce_head(void) { - printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n"); + printk(KERN_EMERG "\nHARDWARE ERROR\n"); } static void print_mce_tail(void) { printk(KERN_EMERG "This is not a software problem!\n" - KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n"); + "Run through mcelog --ascii to decode and contact your hardware vendor\n"); } #define PANIC_TIMEOUT 5 /* 5 seconds */ diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index c4ca89d9aaf4..5cb5725b2bae 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -627,10 +627,9 @@ __init void e820_setup_gap(void) #ifdef CONFIG_X86_64 if (!found) { gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; - printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " - "address range\n" - KERN_ERR "PCI: Unassigned devices with 32bit resource " - "registers may break!\n"); + printk(KERN_ERR + "PCI: Warning: Cannot find a gap in the 32bit address range\n" + "PCI: Unassigned devices with 32bit resource registers may break!\n"); } #endif diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index cfd9f9063896..d2e56b8f48e7 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -675,7 +675,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) nommu: /* Should not happen anymore */ printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" - KERN_WARNING "falling back to iommu=soft.\n"); + "falling back to iommu=soft.\n"); return -1; } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 78a5fff857be..85307cc6e45f 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -426,10 +426,11 @@ static noinline int vmalloc_fault(unsigned long address) } static const char errata93_warning[] = -KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" -KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" -KERN_ERR "******* Please consider a BIOS update.\n" -KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; +KERN_ERR +"******* Your BIOS seems to not contain a fix for K8 errata #93\n" +"******* Working around it, but it may cause SEGVs or burn power.\n" +"******* Please consider a BIOS update.\n" +"******* Disabling USB legacy in the BIOS may also help.\n"; /* * No vm86 mode in 64-bit mode: diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index ba9ab9349782..e64efac3b9db 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -354,10 +354,10 @@ void show_regs(struct pt_regs * regs) for (i = 0; i < 16; i++) { if ((i % 8) == 0) - printk ("\n" KERN_INFO "a%02d: ", i); - printk("%08lx ", regs->areg[i]); + printk(KERN_INFO "a%02d:", i); + printk(KERN_CONT " %08lx", regs->areg[i]); } - printk("\n"); + printk(KERN_CONT "\n"); printk("pc: %08lx, ps: %08lx, depc: %08lx, excvaddr: %08lx\n", regs->pc, regs->ps, regs->depc, regs->excvaddr); diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 9c6e5b0fe894..2f07b7c99a95 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1645,7 +1645,7 @@ static int __init fd_probe_drives(void) { int drive,drives,nomem; - printk(KERN_INFO "FD: probing units\n" KERN_INFO "found "); + printk(KERN_INFO "FD: 
probing units\nfound "); drives=0; nomem=0; for(drive=0;drivedev, " ctrl: %.8x seccnt/cmd: %.4x ver:%.4x\n" - KERN_INFO " status:%.8x mpu_lba:%.8x busmode:%4x\n" - KERN_INFO " error: %.8x cfg_lba:%.8x fatstat:%.4x\n", + dev_info(ace->dev, + " ctrl: %.8x seccnt/cmd: %.4x ver:%.4x\n" + " status:%.8x mpu_lba:%.8x busmode:%4x\n" + " error: %.8x cfg_lba:%.8x fatstat:%.4x\n", ace_in32(ace, ACE_CTRL), ace_in(ace, ACE_SECCNTCMD), ace_in(ace, ACE_VERSION), diff --git a/drivers/char/hw_random/intel-rng.c b/drivers/char/hw_random/intel-rng.c index 5dcbe603eca2..91b53eb1c053 100644 --- a/drivers/char/hw_random/intel-rng.c +++ b/drivers/char/hw_random/intel-rng.c @@ -305,10 +305,11 @@ static int __init intel_init_hw_struct(struct intel_rng_hw *intel_rng_hw, (BIOS_CNTL_LOCK_ENABLE_MASK|BIOS_CNTL_WRITE_ENABLE_MASK)) == BIOS_CNTL_LOCK_ENABLE_MASK) { static __initdata /*const*/ char warning[] = - KERN_WARNING PFX "Firmware space is locked read-only. If you can't or\n" - KERN_WARNING PFX "don't want to disable this in firmware setup, and if\n" - KERN_WARNING PFX "you are certain that your system has a functional\n" - KERN_WARNING PFX "RNG, try using the 'no_fwh_detect' option.\n"; + KERN_WARNING +PFX "Firmware space is locked read-only. If you can't or\n" +PFX "don't want to disable this in firmware setup, and if\n" +PFX "you are certain that your system has a functional\n" +PFX "RNG, try using the 'no_fwh_detect' option.\n"; if (no_fwh_detect) return -ENODEV; diff --git a/drivers/char/isicom.c b/drivers/char/isicom.c index 4159292e35cf..621d1184673c 100644 --- a/drivers/char/isicom.c +++ b/drivers/char/isicom.c @@ -1478,10 +1478,10 @@ static int __devinit load_firmware(struct pci_dev *pdev, status = inw(base + 0x4); if (status != 0) { dev_warn(&pdev->dev, "Card%d rejected load header:\n" - KERN_WARNING "Address:0x%x\n" - KERN_WARNING "Count:0x%x\n" - KERN_WARNING "Status:0x%x\n", - index + 1, frame->addr, frame->count, status); + "Address:0x%x\n" + "Count:0x%x\n" + "Status:0x%x\n", + index + 1, frame->addr, frame->count, status); goto errrelfw; } outsw(base, frame->data, word_count); @@ -1526,10 +1526,10 @@ static int __devinit load_firmware(struct pci_dev *pdev, status = inw(base + 0x4); if (status != 0) { dev_warn(&pdev->dev, "Card%d rejected verify header:\n" - KERN_WARNING "Address:0x%x\n" - KERN_WARNING "Count:0x%x\n" - KERN_WARNING "Status: 0x%x\n", - index + 1, frame->addr, frame->count, status); + "Address:0x%x\n" + "Count:0x%x\n" + "Status: 0x%x\n", + index + 1, frame->addr, frame->count, status); goto errrelfw; } diff --git a/drivers/i2c/busses/i2c-ibm_iic.c b/drivers/i2c/busses/i2c-ibm_iic.c index e4476743f203..b1bc6e277d2a 100644 --- a/drivers/i2c/busses/i2c-ibm_iic.c +++ b/drivers/i2c/busses/i2c-ibm_iic.c @@ -85,10 +85,11 @@ static void dump_iic_regs(const char* header, struct ibm_iic_private* dev) { volatile struct iic_regs __iomem *iic = dev->vaddr; printk(KERN_DEBUG "ibm-iic%d: %s\n", dev->idx, header); - printk(KERN_DEBUG " cntl = 0x%02x, mdcntl = 0x%02x\n" - KERN_DEBUG " sts = 0x%02x, extsts = 0x%02x\n" - KERN_DEBUG " clkdiv = 0x%02x, xfrcnt = 0x%02x\n" - KERN_DEBUG " xtcntlss = 0x%02x, directcntl = 0x%02x\n", + printk(KERN_DEBUG + " cntl = 0x%02x, mdcntl = 0x%02x\n" + " sts = 0x%02x, extsts = 0x%02x\n" + " clkdiv = 0x%02x, xfrcnt = 0x%02x\n" + " xtcntlss = 0x%02x, directcntl = 0x%02x\n", in_8(&iic->cntl), in_8(&iic->mdcntl), in_8(&iic->sts), in_8(&iic->extsts), in_8(&iic->clkdiv), in_8(&iic->xfrcnt), in_8(&iic->xtcntlss), in_8(&iic->directcntl)); diff --git a/drivers/md/md.c 
b/drivers/md/md.c index 0f4a70c43ffc..d4351ff0849f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1756,9 +1756,10 @@ static void print_sb_1(struct mdp_superblock_1 *sb) __u8 *uuid; uuid = sb->set_uuid; - printk(KERN_INFO "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x" - ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n" - KERN_INFO "md: Name: \"%s\" CT:%llu\n", + printk(KERN_INFO + "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x" + ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n" + "md: Name: \"%s\" CT:%llu\n", le32_to_cpu(sb->major_version), le32_to_cpu(sb->feature_map), uuid[0], uuid[1], uuid[2], uuid[3], @@ -1770,12 +1771,13 @@ static void print_sb_1(struct mdp_superblock_1 *sb) & MD_SUPERBLOCK_1_TIME_SEC_MASK); uuid = sb->device_uuid; - printk(KERN_INFO "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" + printk(KERN_INFO + "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" " RO:%llu\n" - KERN_INFO "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x" - ":%02x%02x%02x%02x%02x%02x\n" - KERN_INFO "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" - KERN_INFO "md: (MaxDev:%u) \n", + "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x" + ":%02x%02x%02x%02x%02x%02x\n" + "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" + "md: (MaxDev:%u) \n", le32_to_cpu(sb->level), (unsigned long long)le64_to_cpu(sb->size), le32_to_cpu(sb->raid_disks), diff --git a/drivers/misc/sgi-xp/xpnet.c b/drivers/misc/sgi-xp/xpnet.c index 8d1c60a3f0df..5d778ec8cdb2 100644 --- a/drivers/misc/sgi-xp/xpnet.c +++ b/drivers/misc/sgi-xp/xpnet.c @@ -235,7 +235,7 @@ xpnet_receive(short partid, int channel, struct xpnet_message *msg) skb->ip_summed = CHECKSUM_UNNECESSARY; dev_dbg(xpnet, "passing skb to network layer\n" - KERN_DEBUG "\tskb->head=0x%p skb->data=0x%p skb->tail=0x%p " + "\tskb->head=0x%p skb->data=0x%p skb->tail=0x%p " "skb->end=0x%p skb->len=%d\n", (void *)skb->head, (void *)skb->data, skb_tail_pointer(skb), skb_end_pointer(skb), skb->len); @@ -399,7 +399,7 @@ xpnet_send(struct sk_buff *skb, struct xpnet_pending_msg *queued_msg, msg->buf_pa = xp_pa((void *)start_addr); dev_dbg(xpnet, "sending XPC message to %d:%d\n" - KERN_DEBUG "msg->buf_pa=0x%lx, msg->size=%u, " + "msg->buf_pa=0x%lx, msg->size=%u, " "msg->leadin_ignore=%u, msg->tailout_ignore=%u\n", dest_partid, XPC_NET_CHANNEL, msg->buf_pa, msg->size, msg->leadin_ignore, msg->tailout_ignore); diff --git a/drivers/net/a2065.c b/drivers/net/a2065.c index 85a18175730b..08787f5a22a3 100644 --- a/drivers/net/a2065.c +++ b/drivers/net/a2065.c @@ -569,16 +569,8 @@ static int lance_start_xmit (struct sk_buff *skb, struct net_device *dev) #ifdef DEBUG_DRIVER /* dump the packet */ - { - int i; - - for (i = 0; i < 64; i++) { - if ((i % 16) == 0) - printk("\n" KERN_DEBUG); - printk ("%2.2x ", skb->data [i]); - } - printk("\n"); - } + print_hex_dump(KERN_DEBUG, "skb->data: ", DUMP_PREFIX_NONE, + 16, 1, skb->data, 64, true); #endif entry = lp->tx_new & lp->tx_ring_mod_mask; ib->btx_ring [entry].length = (-skblen) | 0xf000; diff --git a/drivers/net/arcnet/arcnet.c b/drivers/net/arcnet/arcnet.c index d6d4ab3b430c..7d227cdab9f8 100644 --- a/drivers/net/arcnet/arcnet.c +++ b/drivers/net/arcnet/arcnet.c @@ -158,15 +158,12 @@ module_exit(arcnet_exit); void arcnet_dump_skb(struct net_device *dev, struct sk_buff *skb, char *desc) { - int i; + char hdr[32]; - printk(KERN_DEBUG "%6s: skb dump (%s) follows:", dev->name, desc); - for (i = 0; i < skb->len; i++) { - if (i % 
16 == 0) - printk("\n" KERN_DEBUG "[%04X] ", i); - printk("%02X ", ((u_char *) skb->data)[i]); - } - printk("\n"); + /* dump the packet */ + snprintf(hdr, sizeof(hdr), "%6s:%s skb->data:", dev->name, desc); + print_hex_dump(KERN_DEBUG, hdr, DUMP_PREFIX_OFFSET, + 16, 1, skb->data, skb->len, true); } EXPORT_SYMBOL(arcnet_dump_skb); @@ -184,6 +181,7 @@ static void arcnet_dump_packet(struct net_device *dev, int bufnum, int i, length; unsigned long flags = 0; static uint8_t buf[512]; + char hdr[32]; /* hw.copy_from_card expects IRQ context so take the IRQ lock to keep it single threaded */ @@ -197,14 +195,10 @@ static void arcnet_dump_packet(struct net_device *dev, int bufnum, /* if the offset[0] byte is nonzero, this is a 256-byte packet */ length = (buf[2] ? 256 : 512); - printk(KERN_DEBUG "%6s: packet dump (%s) follows:", dev->name, desc); - for (i = 0; i < length; i++) { - if (i % 16 == 0) - printk("\n" KERN_DEBUG "[%04X] ", i); - printk("%02X ", buf[i]); - } - printk("\n"); - + /* dump the packet */ + snprintf(hdr, sizeof(hdr), "%6s:%s packet dump:", dev->name, desc); + print_hex_dump(KERN_DEBUG, hdr, DUMP_PREFIX_OFFSET, + 16, 1, buf, length, true); } #else diff --git a/drivers/net/bmac.c b/drivers/net/bmac.c index 9578a3dfac01..a76315dc7767 100644 --- a/drivers/net/bmac.c +++ b/drivers/net/bmac.c @@ -428,10 +428,11 @@ bmac_init_phy(struct net_device *dev) printk(KERN_DEBUG "phy registers:"); for (addr = 0; addr < 32; ++addr) { if ((addr & 7) == 0) - printk("\n" KERN_DEBUG); - printk(" %.4x", bmac_mif_read(dev, addr)); + printk(KERN_DEBUG); + printk(KERN_CONT " %.4x", bmac_mif_read(dev, addr)); } - printk("\n"); + print(KERN_CONT "\n"); + if (bp->is_bmac_plus) { unsigned int capable, ctrl; diff --git a/drivers/net/bnx2x_main.c b/drivers/net/bnx2x_main.c index 6c67be679764..c36a5f33739f 100644 --- a/drivers/net/bnx2x_main.c +++ b/drivers/net/bnx2x_main.c @@ -484,8 +484,9 @@ static void bnx2x_fw_dump(struct bnx2x *bp) mark = REG_RD(bp, MCP_REG_MCPR_SCRATCH + 0xf104); mark = ((mark + 0x3) & ~0x3); - printk(KERN_ERR PFX "begin fw dump (mark 0x%x)\n" KERN_ERR, mark); + printk(KERN_ERR PFX "begin fw dump (mark 0x%x)\n", mark); + printk(KERN_ERR PFX); for (offset = mark - 0x08000000; offset <= 0xF900; offset += 0x8*4) { for (word = 0; word < 8; word++) data[word] = htonl(REG_RD(bp, MCP_REG_MCPR_SCRATCH + @@ -500,7 +501,7 @@ static void bnx2x_fw_dump(struct bnx2x *bp) data[8] = 0x0; printk(KERN_CONT "%s", (char *)data); } - printk("\n" KERN_ERR PFX "end of fw dump\n"); + printk(KERN_ERR PFX "end of fw dump\n"); } static void bnx2x_panic_dump(struct bnx2x *bp) @@ -7354,7 +7355,7 @@ static void bnx2x_reset_task(struct work_struct *work) #ifdef BNX2X_STOP_ON_ERROR BNX2X_ERR("reset task called but STOP_ON_ERROR defined" " so reset not done to allow debug dump,\n" - KERN_ERR " you will need to reboot when done\n"); + " you will need to reboot when done\n"); return; #endif diff --git a/drivers/net/dl2k.c b/drivers/net/dl2k.c index 895d72143ee0..4b6a219fecea 100644 --- a/drivers/net/dl2k.c +++ b/drivers/net/dl2k.c @@ -268,8 +268,9 @@ rio_probe1 (struct pci_dev *pdev, const struct pci_device_id *ent) printk(KERN_INFO "tx_coalesce:\t%d packets\n", tx_coalesce); if (np->coalesce) - printk(KERN_INFO "rx_coalesce:\t%d packets\n" - KERN_INFO "rx_timeout: \t%d ns\n", + printk(KERN_INFO + "rx_coalesce:\t%d packets\n" + "rx_timeout: \t%d ns\n", np->rx_coalesce, np->rx_timeout*640); if (np->vlan) printk(KERN_INFO "vlan(id):\t%d\n", np->vlan); @@ -1522,9 +1523,9 @@ mii_get_media (struct net_device *dev) printk 
(KERN_INFO "Operating at 10 Mbps, "); } if (bmcr & MII_BMCR_DUPLEX_MODE) { - printk ("Full duplex\n"); + printk (KERN_CONT "Full duplex\n"); } else { - printk ("Half duplex\n"); + printk (KERN_CONT "Half duplex\n"); } } if (np->tx_flow) @@ -1614,9 +1615,9 @@ mii_set_media (struct net_device *dev) } if (np->full_duplex) { bmcr |= MII_BMCR_DUPLEX_MODE; - printk ("Full duplex\n"); + printk (KERN_CONT "Full duplex\n"); } else { - printk ("Half duplex\n"); + printk (KERN_CONT "Half duplex\n"); } #if 0 /* Set 1000BaseT Master/Slave setting */ @@ -1669,9 +1670,9 @@ mii_get_media_pcs (struct net_device *dev) __u16 bmcr = mii_read (dev, phy_addr, PCS_BMCR); printk (KERN_INFO "Operating at 1000 Mbps, "); if (bmcr & MII_BMCR_DUPLEX_MODE) { - printk ("Full duplex\n"); + printk (KERN_CONT "Full duplex\n"); } else { - printk ("Half duplex\n"); + printk (KERN_CONT "Half duplex\n"); } } if (np->tx_flow) diff --git a/drivers/net/epic100.c b/drivers/net/epic100.c index b60e27dfcfa7..88d7ebf31220 100644 --- a/drivers/net/epic100.c +++ b/drivers/net/epic100.c @@ -338,8 +338,7 @@ static int __devinit epic_init_one (struct pci_dev *pdev, #ifndef MODULE static int printed_version; if (!printed_version++) - printk (KERN_INFO "%s" KERN_INFO "%s", - version, version2); + printk(KERN_INFO "%s%s", version, version2); #endif card_idx++; @@ -1600,7 +1599,7 @@ static int __init epic_init (void) { /* when a module, this is printed whether or not devices are found in probe */ #ifdef MODULE - printk (KERN_INFO "%s" KERN_INFO "%s", + printk (KERN_INFO "%s%s", version, version2); #endif diff --git a/drivers/net/fealnx.c b/drivers/net/fealnx.c index 891be28a7d4f..053fb49820b9 100644 --- a/drivers/net/fealnx.c +++ b/drivers/net/fealnx.c @@ -1209,17 +1209,20 @@ static void fealnx_tx_timeout(struct net_device *dev) unsigned long flags; int i; - printk(KERN_WARNING "%s: Transmit timed out, status %8.8x," - " resetting...\n", dev->name, ioread32(ioaddr + ISR)); + printk(KERN_WARNING + "%s: Transmit timed out, status %8.8x, resetting...\n", + dev->name, ioread32(ioaddr + ISR)); { printk(KERN_DEBUG " Rx ring %p: ", np->rx_ring); for (i = 0; i < RX_RING_SIZE; i++) - printk(" %8.8x", (unsigned int) np->rx_ring[i].status); - printk("\n" KERN_DEBUG " Tx ring %p: ", np->tx_ring); + printk(PR_CONT " %8.8x", + (unsigned int) np->rx_ring[i].status); + printk(KERN_CONT "\n"); + printk(KERN_DEBUG " Tx ring %p: ", np->tx_ring); for (i = 0; i < TX_RING_SIZE; i++) - printk(" %4.4x", np->tx_ring[i].status); - printk("\n"); + printk(PR_CONT " %4.4x", np->tx_ring[i].status); + printk(PR_CONT "\n"); } spin_lock_irqsave(&np->lock, flags); diff --git a/drivers/net/hamachi.c b/drivers/net/hamachi.c index 9d5b62cb30f7..d62378cbc149 100644 --- a/drivers/net/hamachi.c +++ b/drivers/net/hamachi.c @@ -173,8 +173,8 @@ static int tx_params[MAX_UNITS] = {-1, -1, -1, -1, -1, -1, -1, -1}; static const char version[] __devinitconst = KERN_INFO DRV_NAME ".c:v" DRV_VERSION " " DRV_RELDATE " Written by Donald Becker\n" -KERN_INFO " Some modifications by Eric kasten \n" -KERN_INFO " Further modifications by Keith Underwood \n"; +" Some modifications by Eric kasten \n" +" Further modifications by Keith Underwood \n"; /* IP_MF appears to be only defined in , however, @@ -1080,11 +1080,14 @@ static void hamachi_tx_timeout(struct net_device *dev) { printk(KERN_DEBUG " Rx ring %p: ", hmp->rx_ring); for (i = 0; i < RX_RING_SIZE; i++) - printk(" %8.8x", le32_to_cpu(hmp->rx_ring[i].status_n_length)); - printk("\n"KERN_DEBUG" Tx ring %p: ", hmp->tx_ring); + printk(KERN_CONT " 
%8.8x", + le32_to_cpu(hmp->rx_ring[i].status_n_length)); + printk(KERN_CONT "\n"); + printk(KERN_DEBUG" Tx ring %p: ", hmp->tx_ring); for (i = 0; i < TX_RING_SIZE; i++) - printk(" %4.4x", le32_to_cpu(hmp->tx_ring[i].status_n_length)); - printk("\n"); + printk(KERN_CONT " %4.4x", + le32_to_cpu(hmp->tx_ring[i].status_n_length)); + printk(KERN_CONT "\n"); } /* Reinit the hardware and make sure the Rx and Tx processes @@ -1753,13 +1756,13 @@ static int hamachi_close(struct net_device *dev) #ifdef __i386__ if (hamachi_debug > 2) { - printk("\n"KERN_DEBUG" Tx ring at %8.8x:\n", + printk(KERN_DEBUG " Tx ring at %8.8x:\n", (int)hmp->tx_ring_dma); for (i = 0; i < TX_RING_SIZE; i++) - printk(" %c #%d desc. %8.8x %8.8x.\n", + printk(KERN_DEBUG " %c #%d desc. %8.8x %8.8x.\n", readl(ioaddr + TxCurPtr) == (long)&hmp->tx_ring[i] ? '>' : ' ', i, hmp->tx_ring[i].status_n_length, hmp->tx_ring[i].addr); - printk("\n"KERN_DEBUG " Rx ring %8.8x:\n", + printk(KERN_DEBUG " Rx ring %8.8x:\n", (int)hmp->rx_ring_dma); for (i = 0; i < RX_RING_SIZE; i++) { printk(KERN_DEBUG " %c #%d desc. %4.4x %8.8x\n", @@ -1770,7 +1773,7 @@ static int hamachi_close(struct net_device *dev) u16 *addr = (u16 *) hmp->rx_skbuff[i]->data; int j; - + printk(KERN_DEBUG "Addr: "); for (j = 0; j < 0x50; j++) printk(" %4.4x", addr[j]); printk("\n"); diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c index 5e4b7afd0683..352703255bba 100644 --- a/drivers/net/hamradio/baycom_epp.c +++ b/drivers/net/hamradio/baycom_epp.c @@ -68,7 +68,7 @@ static const char paranoia_str[] = KERN_ERR static const char bc_drvname[] = "baycom_epp"; static const char bc_drvinfo[] = KERN_INFO "baycom_epp: (C) 1998-2000 Thomas Sailer, HB9JNX/AE4WA\n" -KERN_INFO "baycom_epp: version 0.7 compiled " __TIME__ " " __DATE__ "\n"; +"baycom_epp: version 0.7 compiled " __TIME__ " " __DATE__ "\n"; /* --------------------------------------------------------------------- */ diff --git a/drivers/net/hamradio/baycom_par.c b/drivers/net/hamradio/baycom_par.c index 2e6fc4dc74b1..5f5af9a606f8 100644 --- a/drivers/net/hamradio/baycom_par.c +++ b/drivers/net/hamradio/baycom_par.c @@ -102,7 +102,7 @@ static const char bc_drvname[] = "baycom_par"; static const char bc_drvinfo[] = KERN_INFO "baycom_par: (C) 1996-2000 Thomas Sailer, HB9JNX/AE4WA\n" -KERN_INFO "baycom_par: version 0.9 compiled " __TIME__ " " __DATE__ "\n"; +"baycom_par: version 0.9 compiled " __TIME__ " " __DATE__ "\n"; /* --------------------------------------------------------------------- */ diff --git a/drivers/net/hamradio/baycom_ser_fdx.c b/drivers/net/hamradio/baycom_ser_fdx.c index b6a816e60c0f..aa4488e871b2 100644 --- a/drivers/net/hamradio/baycom_ser_fdx.c +++ b/drivers/net/hamradio/baycom_ser_fdx.c @@ -91,7 +91,7 @@ static const char bc_drvname[] = "baycom_ser_fdx"; static const char bc_drvinfo[] = KERN_INFO "baycom_ser_fdx: (C) 1996-2000 Thomas Sailer, HB9JNX/AE4WA\n" -KERN_INFO "baycom_ser_fdx: version 0.10 compiled " __TIME__ " " __DATE__ "\n"; +"baycom_ser_fdx: version 0.10 compiled " __TIME__ " " __DATE__ "\n"; /* --------------------------------------------------------------------- */ diff --git a/drivers/net/hamradio/baycom_ser_hdx.c b/drivers/net/hamradio/baycom_ser_hdx.c index 3bcc57acbe6d..88c593596020 100644 --- a/drivers/net/hamradio/baycom_ser_hdx.c +++ b/drivers/net/hamradio/baycom_ser_hdx.c @@ -79,7 +79,7 @@ static const char bc_drvname[] = "baycom_ser_hdx"; static const char bc_drvinfo[] = KERN_INFO "baycom_ser_hdx: (C) 1996-2000 Thomas Sailer, HB9JNX/AE4WA\n" 
-KERN_INFO "baycom_ser_hdx: version 0.10 compiled " __TIME__ " " __DATE__ "\n"; +"baycom_ser_hdx: version 0.10 compiled " __TIME__ " " __DATE__ "\n"; /* --------------------------------------------------------------------- */ diff --git a/drivers/net/natsemi.c b/drivers/net/natsemi.c index c9bfe4eea189..78c088331f57 100644 --- a/drivers/net/natsemi.c +++ b/drivers/net/natsemi.c @@ -130,8 +130,8 @@ static int full_duplex[MAX_UNITS]; static const char version[] __devinitconst = KERN_INFO DRV_NAME " dp8381x driver, version " DRV_VERSION ", " DRV_RELDATE "\n" - KERN_INFO " originally by Donald Becker \n" - KERN_INFO " 2.4.x kernel port by Jeff Garzik, Tjeerd Mulder\n"; + " originally by Donald Becker \n" + " 2.4.x kernel port by Jeff Garzik, Tjeerd Mulder\n"; MODULE_AUTHOR("Donald Becker "); MODULE_DESCRIPTION("National Semiconductor DP8381x series PCI Ethernet driver"); diff --git a/drivers/net/ne.c b/drivers/net/ne.c index 5c3e242428f1..992dbfffdb05 100644 --- a/drivers/net/ne.c +++ b/drivers/net/ne.c @@ -321,7 +321,7 @@ static int __init ne_probe1(struct net_device *dev, unsigned long ioaddr) } if (ei_debug && version_printed++ == 0) - printk(KERN_INFO "%s" KERN_INFO "%s", version1, version2); + printk(KERN_INFO "%s%s", version1, version2); printk(KERN_INFO "NE*000 ethercard probe at %#3lx:", ioaddr); diff --git a/drivers/net/pci-skeleton.c b/drivers/net/pci-skeleton.c index 8c1f6988f398..89f7b2ad5231 100644 --- a/drivers/net/pci-skeleton.c +++ b/drivers/net/pci-skeleton.c @@ -105,7 +105,7 @@ IVc. Errata static char version[] __devinitdata = KERN_INFO NETDRV_DRIVER_LOAD_MSG "\n" -KERN_INFO " Support available from http://foo.com/bar/baz.html\n"; +" Support available from http://foo.com/bar/baz.html\n"; /* define to 1 to enable PIO instead of MMIO */ #undef USE_IO_OPS diff --git a/drivers/net/pcmcia/ibmtr_cs.c b/drivers/net/pcmcia/ibmtr_cs.c index f51944b28cfa..06618af1a468 100644 --- a/drivers/net/pcmcia/ibmtr_cs.c +++ b/drivers/net/pcmcia/ibmtr_cs.c @@ -298,14 +298,11 @@ static int __devinit ibmtr_config(struct pcmcia_device *link) strcpy(info->node.dev_name, dev->name); - printk(KERN_INFO "%s: port %#3lx, irq %d,", - dev->name, dev->base_addr, dev->irq); - printk (" mmio %#5lx,", (u_long)ti->mmio); - printk (" sram %#5lx,", (u_long)ti->sram_base << 12); - printk ("\n" KERN_INFO " hwaddr="); - for (i = 0; i < TR_ALEN; i++) - printk("%02X", dev->dev_addr[i]); - printk("\n"); + printk(KERN_INFO + "%s: port %#3lx, irq %d, mmio %#5lx, sram %#5lx, hwaddr=%pM\n", + dev->name, dev->base_addr, dev->irq, + (u_long)ti->mmio, (u_long)(ti->sram_base << 12), + dev->dev_addr); return 0; cs_failed: diff --git a/drivers/net/pcmcia/nmclan_cs.c b/drivers/net/pcmcia/nmclan_cs.c index 02ef63ed1f99..36de91baf238 100644 --- a/drivers/net/pcmcia/nmclan_cs.c +++ b/drivers/net/pcmcia/nmclan_cs.c @@ -1425,15 +1425,12 @@ static void BuildLAF(int *ladrf, int *adr) ladrf[byte] |= (1 << (hashcode & 7)); #ifdef PCMCIA_DEBUG - if (pc_debug > 2) { - printk(KERN_DEBUG " adr ="); - for (i = 0; i < 6; i++) - printk(" %02X", adr[i]); - printk("\n" KERN_DEBUG " hashcode = %d(decimal), ladrf[0:63]" - " =", hashcode); - for (i = 0; i < 8; i++) - printk(" %02X", ladrf[i]); - printk("\n"); + if (pc_debug > 2) + printk(KERN_DEBUG " adr =%pM\n", adr); + printk(KERN_DEBUG " hashcode = %d(decimal), ladrf[0:63] =", hashcode); + for (i = 0; i < 8; i++) + printk(KERN_CONT " %02X", ladrf[i]); + printk(KERN_CONT "\n"); } #endif } /* BuildLAF */ diff --git a/drivers/net/pcnet32.c b/drivers/net/pcnet32.c index 1c35e1d637a0..28368157dac4 
100644 --- a/drivers/net/pcnet32.c +++ b/drivers/net/pcnet32.c @@ -485,7 +485,7 @@ static void pcnet32_realloc_tx_ring(struct net_device *dev, &new_ring_dma_addr); if (new_tx_ring == NULL) { if (netif_msg_drv(lp)) - printk("\n" KERN_ERR + printk(KERN_ERR "%s: Consistent memory allocation failed.\n", dev->name); return; @@ -496,7 +496,7 @@ static void pcnet32_realloc_tx_ring(struct net_device *dev, GFP_ATOMIC); if (!new_dma_addr_list) { if (netif_msg_drv(lp)) - printk("\n" KERN_ERR + printk(KERN_ERR "%s: Memory allocation failed.\n", dev->name); goto free_new_tx_ring; } @@ -505,7 +505,7 @@ static void pcnet32_realloc_tx_ring(struct net_device *dev, GFP_ATOMIC); if (!new_skb_list) { if (netif_msg_drv(lp)) - printk("\n" KERN_ERR + printk(KERN_ERR "%s: Memory allocation failed.\n", dev->name); goto free_new_lists; } @@ -563,7 +563,7 @@ static void pcnet32_realloc_rx_ring(struct net_device *dev, &new_ring_dma_addr); if (new_rx_ring == NULL) { if (netif_msg_drv(lp)) - printk("\n" KERN_ERR + printk(KERN_ERR "%s: Consistent memory allocation failed.\n", dev->name); return; @@ -574,7 +574,7 @@ static void pcnet32_realloc_rx_ring(struct net_device *dev, GFP_ATOMIC); if (!new_dma_addr_list) { if (netif_msg_drv(lp)) - printk("\n" KERN_ERR + printk(KERN_ERR "%s: Memory allocation failed.\n", dev->name); goto free_new_rx_ring; } @@ -583,7 +583,7 @@ static void pcnet32_realloc_rx_ring(struct net_device *dev, GFP_ATOMIC); if (!new_skb_list) { if (netif_msg_drv(lp)) - printk("\n" KERN_ERR + printk(KERN_ERR "%s: Memory allocation failed.\n", dev->name); goto free_new_lists; } @@ -1766,38 +1766,38 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev) /* Version 0x2623 and 0x2624 */ if (((chip_version + 1) & 0xfffe) == 0x2624) { i = a->read_csr(ioaddr, 80) & 0x0C00; /* Check tx_start_pt */ - printk("\n" KERN_INFO " tx_start_pt(0x%04x):", i); + printk(KERN_INFO " tx_start_pt(0x%04x):", i); switch (i >> 10) { case 0: - printk(" 20 bytes,"); + printk(KERN_CONT " 20 bytes,"); break; case 1: - printk(" 64 bytes,"); + printk(KERN_CONT " 64 bytes,"); break; case 2: - printk(" 128 bytes,"); + printk(KERN_CONT " 128 bytes,"); break; case 3: - printk("~220 bytes,"); + printk(KERN_CONT "~220 bytes,"); break; } i = a->read_bcr(ioaddr, 18); /* Check Burst/Bus control */ - printk(" BCR18(%x):", i & 0xffff); + printk(KERN_CONT " BCR18(%x):", i & 0xffff); if (i & (1 << 5)) - printk("BurstWrEn "); + printk(KERN_CONT "BurstWrEn "); if (i & (1 << 6)) - printk("BurstRdEn "); + printk(KERN_CONT "BurstRdEn "); if (i & (1 << 7)) - printk("DWordIO "); + printk(KERN_CONT "DWordIO "); if (i & (1 << 11)) - printk("NoUFlow "); + printk(KERN_CONT "NoUFlow "); i = a->read_bcr(ioaddr, 25); - printk("\n" KERN_INFO " SRAMSIZE=0x%04x,", i << 8); + printk(KERN_INFO " SRAMSIZE=0x%04x,", i << 8); i = a->read_bcr(ioaddr, 26); - printk(" SRAM_BND=0x%04x,", i << 8); + printk(KERN_CONT " SRAM_BND=0x%04x,", i << 8); i = a->read_bcr(ioaddr, 27); if (i & (1 << 14)) - printk("LowLatRx"); + printk(KERN_CONT "LowLatRx"); } } @@ -1996,7 +1996,7 @@ static int pcnet32_alloc_ring(struct net_device *dev, const char *name) &lp->tx_ring_dma_addr); if (lp->tx_ring == NULL) { if (netif_msg_drv(lp)) - printk("\n" KERN_ERR PFX + printk(KERN_ERR PFX "%s: Consistent memory allocation failed.\n", name); return -ENOMEM; @@ -2008,7 +2008,7 @@ static int pcnet32_alloc_ring(struct net_device *dev, const char *name) &lp->rx_ring_dma_addr); if (lp->rx_ring == NULL) { if (netif_msg_drv(lp)) - printk("\n" KERN_ERR PFX + printk(KERN_ERR PFX "%s: Consistent 
memory allocation failed.\n", name); return -ENOMEM; @@ -2018,7 +2018,7 @@ static int pcnet32_alloc_ring(struct net_device *dev, const char *name) GFP_ATOMIC); if (!lp->tx_dma_addr) { if (netif_msg_drv(lp)) - printk("\n" KERN_ERR PFX + printk(KERN_ERR PFX "%s: Memory allocation failed.\n", name); return -ENOMEM; } @@ -2027,7 +2027,7 @@ static int pcnet32_alloc_ring(struct net_device *dev, const char *name) GFP_ATOMIC); if (!lp->rx_dma_addr) { if (netif_msg_drv(lp)) - printk("\n" KERN_ERR PFX + printk(KERN_ERR PFX "%s: Memory allocation failed.\n", name); return -ENOMEM; } @@ -2036,7 +2036,7 @@ static int pcnet32_alloc_ring(struct net_device *dev, const char *name) GFP_ATOMIC); if (!lp->tx_skbuff) { if (netif_msg_drv(lp)) - printk("\n" KERN_ERR PFX + printk(KERN_ERR PFX "%s: Memory allocation failed.\n", name); return -ENOMEM; } @@ -2045,7 +2045,7 @@ static int pcnet32_alloc_ring(struct net_device *dev, const char *name) GFP_ATOMIC); if (!lp->rx_skbuff) { if (netif_msg_drv(lp)) - printk("\n" KERN_ERR PFX + printk(KERN_ERR PFX "%s: Memory allocation failed.\n", name); return -ENOMEM; } diff --git a/drivers/net/starfire.c b/drivers/net/starfire.c index 838cce8b8fff..669253c7bd41 100644 --- a/drivers/net/starfire.c +++ b/drivers/net/starfire.c @@ -180,7 +180,7 @@ static int full_duplex[MAX_UNITS] = {0, }; /* These identify the driver base version and may not be removed. */ static const char version[] __devinitconst = KERN_INFO "starfire.c:v1.03 7/26/2000 Written by Donald Becker \n" -KERN_INFO " (unofficial 2.2/2.4 kernel port, version " DRV_VERSION ", " DRV_RELDATE ")\n"; +" (unofficial 2.2/2.4 kernel port, version " DRV_VERSION ", " DRV_RELDATE ")\n"; MODULE_AUTHOR("Donald Becker "); MODULE_DESCRIPTION("Adaptec Starfire Ethernet driver"); diff --git a/drivers/net/sundance.c b/drivers/net/sundance.c index 545f81b34ad7..d1521c3875b2 100644 --- a/drivers/net/sundance.c +++ b/drivers/net/sundance.c @@ -1698,13 +1698,13 @@ static int netdev_close(struct net_device *dev) #ifdef __i386__ if (netif_msg_hw(np)) { - printk("\n"KERN_DEBUG" Tx ring at %8.8x:\n", + printk(KERN_DEBUG " Tx ring at %8.8x:\n", (int)(np->tx_ring_dma)); for (i = 0; i < TX_RING_SIZE; i++) - printk(" #%d desc. %4.4x %8.8x %8.8x.\n", + printk(KERN_DEBUG " #%d desc. %4.4x %8.8x %8.8x.\n", i, np->tx_ring[i].status, np->tx_ring[i].frag[0].addr, np->tx_ring[i].frag[0].length); - printk("\n"KERN_DEBUG " Rx ring %8.8x:\n", + printk(KERN_DEBUG " Rx ring %8.8x:\n", (int)(np->rx_ring_dma)); for (i = 0; i < /*RX_RING_SIZE*/4 ; i++) { printk(KERN_DEBUG " #%d desc. %4.4x %4.4x %8.8x\n", diff --git a/drivers/net/tsi108_eth.c b/drivers/net/tsi108_eth.c index 0f78f99f9b20..7030bd5e9848 100644 --- a/drivers/net/tsi108_eth.c +++ b/drivers/net/tsi108_eth.c @@ -1132,7 +1132,9 @@ static int tsi108_get_mac(struct net_device *dev) } if (!is_valid_ether_addr(dev->dev_addr)) { - printk("KERN_ERR: word1: %08x, word2: %08x\n", word1, word2); + printk(KERN_ERR + "%s: Invalid MAC address. 
word1: %08x, word2: %08x\n", + dev->name, word1, word2); return -EINVAL; } @@ -1201,8 +1203,8 @@ static void tsi108_set_rx_mode(struct net_device *dev) __set_bit(hash, &data->mc_hash[0]); } else { printk(KERN_ERR - "%s: got multicast address of length %d " - "instead of 6.\n", dev->name, + "%s: got multicast address of length %d instead of 6.\n", + dev->name, mc->dmi_addrlen); } diff --git a/drivers/net/tulip/de2104x.c b/drivers/net/tulip/de2104x.c index 81f054dbb88d..ef49744a5085 100644 --- a/drivers/net/tulip/de2104x.c +++ b/drivers/net/tulip/de2104x.c @@ -944,9 +944,10 @@ static void de_set_media (struct de_private *de) macmode &= ~FullDuplex; if (netif_msg_link(de)) { - printk(KERN_INFO "%s: set link %s\n" - KERN_INFO "%s: mode 0x%x, sia 0x%x,0x%x,0x%x,0x%x\n" - KERN_INFO "%s: set mode 0x%x, set sia 0x%x,0x%x,0x%x\n", + printk(KERN_INFO + "%s: set link %s\n" + "%s: mode 0x%x, sia 0x%x,0x%x,0x%x,0x%x\n" + "%s: set mode 0x%x, set sia 0x%x,0x%x,0x%x\n", de->dev->name, media_name[media], de->dev->name, dr32(MacMode), dr32(SIAStatus), dr32(CSR13), dr32(CSR14), dr32(CSR15), diff --git a/drivers/net/tulip/tulip_core.c b/drivers/net/tulip/tulip_core.c index 2abb5d3becc6..99a63649f4fc 100644 --- a/drivers/net/tulip/tulip_core.c +++ b/drivers/net/tulip/tulip_core.c @@ -570,16 +570,18 @@ static void tulip_tx_timeout(struct net_device *dev) (unsigned int)tp->rx_ring[i].buffer2, buf[0], buf[1], buf[2]); for (j = 0; buf[j] != 0xee && j < 1600; j++) - if (j < 100) printk(" %2.2x", buf[j]); - printk(" j=%d.\n", j); + if (j < 100) + printk(KERN_CONT " %2.2x", buf[j]); + printk(KERN_CONT " j=%d.\n", j); } printk(KERN_DEBUG " Rx ring %8.8x: ", (int)tp->rx_ring); for (i = 0; i < RX_RING_SIZE; i++) - printk(" %8.8x", (unsigned int)tp->rx_ring[i].status); - printk("\n" KERN_DEBUG " Tx ring %8.8x: ", (int)tp->tx_ring); + printk(KERN_CONT " %8.8x", + (unsigned int)tp->rx_ring[i].status); + printk(KERN_DEBUG " Tx ring %8.8x: ", (int)tp->tx_ring); for (i = 0; i < TX_RING_SIZE; i++) - printk(" %8.8x", (unsigned int)tp->tx_ring[i].status); - printk("\n"); + printk(KERN_CONT " %8.8x", (unsigned int)tp->tx_ring[i].status); + printk(KERN_CONT "\n"); } #endif diff --git a/drivers/net/tulip/winbond-840.c b/drivers/net/tulip/winbond-840.c index 842b1a2c40d4..0f15773dae52 100644 --- a/drivers/net/tulip/winbond-840.c +++ b/drivers/net/tulip/winbond-840.c @@ -142,7 +142,7 @@ static int full_duplex[MAX_UNITS] = {-1, -1, -1, -1, -1, -1, -1, -1}; static const char version[] __initconst = KERN_INFO DRV_NAME ".c:v" DRV_VERSION " (2.4 port) " DRV_RELDATE " Donald Becker \n" - KERN_INFO " http://www.scyld.com/network/drivers.html\n"; + " http://www.scyld.com/network/drivers.html\n"; MODULE_AUTHOR("Donald Becker "); MODULE_DESCRIPTION("Winbond W89c840 Ethernet driver"); @@ -939,7 +939,7 @@ static void tx_timeout(struct net_device *dev) printk(KERN_DEBUG " Rx ring %p: ", np->rx_ring); for (i = 0; i < RX_RING_SIZE; i++) printk(" %8.8x", (unsigned int)np->rx_ring[i].status); - printk("\n"KERN_DEBUG" Tx ring %p: ", np->tx_ring); + printk(KERN_DEBUG" Tx ring %p: ", np->tx_ring); for (i = 0; i < TX_RING_SIZE; i++) printk(" %8.8x", np->tx_ring[i].status); printk("\n"); @@ -1520,7 +1520,7 @@ static int netdev_close(struct net_device *dev) printk(KERN_DEBUG " #%d desc. %4.4x %4.4x %8.8x.\n", i, np->tx_ring[i].length, np->tx_ring[i].status, np->tx_ring[i].buffer1); - printk("\n"KERN_DEBUG " Rx ring %8.8x:\n", + printk(KERN_DEBUG " Rx ring %8.8x:\n", (int)np->rx_ring); for (i = 0; i < RX_RING_SIZE; i++) { printk(KERN_DEBUG " #%d desc. 
%4.4x %4.4x %8.8x\n", diff --git a/drivers/net/wan/hd64570.c b/drivers/net/wan/hd64570.c index 223238de475c..1ea1ef6c3b96 100644 --- a/drivers/net/wan/hd64570.c +++ b/drivers/net/wan/hd64570.c @@ -584,8 +584,9 @@ static void sca_dump_rings(struct net_device *dev) sca_in(DSR_RX(phy_node(port)), card) & DSR_DE ? "" : "in"); for (cnt = 0; cnt < port_to_card(port)->rx_ring_buffers; cnt++) printk(" %02X", readb(&(desc_address(port, cnt, 0)->stat))); + printk(KERN_CONT "\n"); - printk("\n" KERN_DEBUG "TX ring: CDA=%u EDA=%u DSR=%02X in=%u " + printk(KERN_DEBUG "TX ring: CDA=%u EDA=%u DSR=%02X in=%u " "last=%u %sactive", sca_inw(get_dmac_tx(port) + CDAL, card), sca_inw(get_dmac_tx(port) + EDAL, card), diff --git a/drivers/net/wan/hd64572.c b/drivers/net/wan/hd64572.c index 497b003d7239..f099c34a3ae2 100644 --- a/drivers/net/wan/hd64572.c +++ b/drivers/net/wan/hd64572.c @@ -529,8 +529,9 @@ static void sca_dump_rings(struct net_device *dev) sca_in(DSR_RX(port->chan), card) & DSR_DE ? "" : "in"); for (cnt = 0; cnt < port->card->rx_ring_buffers; cnt++) printk(" %02X", readb(&(desc_address(port, cnt, 0)->stat))); + printk(KERN_CONT "\n"); - printk("\n" KERN_DEBUG "TX ring: CDA=%u EDA=%u DSR=%02X in=%u " + printk(KERN_DEBUG "TX ring: CDA=%u EDA=%u DSR=%02X in=%u " "last=%u %sactive", sca_inl(get_dmac_tx(port) + CDAL, card), sca_inl(get_dmac_tx(port) + EDAL, card), diff --git a/drivers/net/wan/sbni.c b/drivers/net/wan/sbni.c index 3fb9dbc88a1a..d14e95a08d66 100644 --- a/drivers/net/wan/sbni.c +++ b/drivers/net/wan/sbni.c @@ -326,11 +326,9 @@ sbni_pci_probe( struct net_device *dev ) } if (pci_irq_line <= 0 || pci_irq_line >= nr_irqs) - printk( KERN_WARNING " WARNING: The PCI BIOS assigned " - "this PCI card to IRQ %d, which is unlikely " - "to work!.\n" - KERN_WARNING " You should use the PCI BIOS " - "setup to assign a valid IRQ line.\n", + printk( KERN_WARNING + " WARNING: The PCI BIOS assigned this PCI card to IRQ %d, which is unlikely to work!.\n" + " You should use the PCI BIOS setup to assign a valid IRQ line.\n", pci_irq_line ); /* avoiding re-enable dual adapters */ diff --git a/drivers/net/wireless/ray_cs.c b/drivers/net/wireless/ray_cs.c index b10b0383dfa5..698b11b1cadb 100644 --- a/drivers/net/wireless/ray_cs.c +++ b/drivers/net/wireless/ray_cs.c @@ -2427,11 +2427,10 @@ static void untranslate(ray_dev_t *local, struct sk_buff *skb, int len) #ifdef PCMCIA_DEBUG if (pc_debug > 3) { - int i; - printk(KERN_DEBUG "skb->data before untranslate"); - for (i = 0; i < 64; i++) - printk("%02x ", skb->data[i]); - printk("\n" KERN_DEBUG + print_hex_dump(KERN_DEBUG, "skb->data before untranslate: ", + DUMP_PREFIX_NONE, 16, 1, + skb->data, 64, true); + printk(KERN_DEBUG "type = %08x, xsap = %02x%02x%02x, org = %02x02x02x\n", ntohs(type), psnap->dsap, psnap->ssap, psnap->ctrl, psnap->org[0], psnap->org[1], psnap->org[2]); diff --git a/drivers/net/wireless/wavelan_cs.c b/drivers/net/wireless/wavelan_cs.c index 6af706408ac0..c6d300666ad8 100644 --- a/drivers/net/wireless/wavelan_cs.c +++ b/drivers/net/wireless/wavelan_cs.c @@ -3556,17 +3556,8 @@ wv_82593_config(struct net_device * dev) cfblk.rcvstop = TRUE; /* Enable Receive Stop Register */ #ifdef DEBUG_I82593_SHOW - { - u_char *c = (u_char *) &cfblk; - int i; - printk(KERN_DEBUG "wavelan_cs: config block:"); - for(i = 0; i < sizeof(struct i82593_conf_block); i++,c++) - { - if((i % 16) == 0) printk("\n" KERN_DEBUG); - printk("%02x ", *c); - } - printk("\n"); - } + print_hex_dump(KERN_DEBUG, "wavelan_cs: config block: ", DUMP_PREFIX_NONE, + 16, 1, &cfblk, 
sizeof(struct i82593_conf_block), false); #endif /* Copy the config block to the i82593 */ diff --git a/drivers/net/yellowfin.c b/drivers/net/yellowfin.c index 3c7a5053f1da..a07580138e81 100644 --- a/drivers/net/yellowfin.c +++ b/drivers/net/yellowfin.c @@ -109,7 +109,7 @@ static int gx_fix; /* These identify the driver base version and may not be removed. */ static const char version[] __devinitconst = KERN_INFO DRV_NAME ".c:v1.05 1/09/2001 Written by Donald Becker \n" - KERN_INFO " (unofficial 2.4.x port, " DRV_VERSION ", " DRV_RELDATE ")\n"; + " (unofficial 2.4.x port, " DRV_VERSION ", " DRV_RELDATE ")\n"; MODULE_AUTHOR("Donald Becker "); MODULE_DESCRIPTION("Packet Engines Yellowfin G-NIC Gigabit Ethernet driver"); @@ -700,12 +700,15 @@ static void yellowfin_tx_timeout(struct net_device *dev) int i; printk(KERN_WARNING " Rx ring %p: ", yp->rx_ring); for (i = 0; i < RX_RING_SIZE; i++) - printk(" %8.8x", yp->rx_ring[i].result_status); - printk("\n"KERN_WARNING" Tx ring %p: ", yp->tx_ring); + printk(KERN_CONT " %8.8x", + yp->rx_ring[i].result_status); + printk(KERN_CONT "\n"); + printk(KERN_WARNING" Tx ring %p: ", yp->tx_ring); for (i = 0; i < TX_RING_SIZE; i++) - printk(" %4.4x /%8.8x", yp->tx_status[i].tx_errs, - yp->tx_ring[i].result_status); - printk("\n"); + printk(KERN_CONT " %4.4x /%8.8x", + yp->tx_status[i].tx_errs, + yp->tx_ring[i].result_status); + printk(KERN_CONT "\n"); } /* If the hardware is found to hang regularly, we will update the code @@ -1216,20 +1219,20 @@ static int yellowfin_close(struct net_device *dev) #if defined(__i386__) if (yellowfin_debug > 2) { - printk("\n"KERN_DEBUG" Tx ring at %8.8llx:\n", + printk(KERN_DEBUG" Tx ring at %8.8llx:\n", (unsigned long long)yp->tx_ring_dma); for (i = 0; i < TX_RING_SIZE*2; i++) - printk(" %c #%d desc. %8.8x %8.8x %8.8x %8.8x.\n", + printk(KERN_DEBUG " %c #%d desc. %8.8x %8.8x %8.8x %8.8x.\n", ioread32(ioaddr + TxPtr) == (long)&yp->tx_ring[i] ? '>' : ' ', i, yp->tx_ring[i].dbdma_cmd, yp->tx_ring[i].addr, yp->tx_ring[i].branch_addr, yp->tx_ring[i].result_status); printk(KERN_DEBUG " Tx status %p:\n", yp->tx_status); for (i = 0; i < TX_RING_SIZE; i++) - printk(" #%d status %4.4x %4.4x %4.4x %4.4x.\n", + printk(KERN_DEBUG " #%d status %4.4x %4.4x %4.4x %4.4x.\n", i, yp->tx_status[i].tx_cnt, yp->tx_status[i].tx_errs, yp->tx_status[i].total_tx_cnt, yp->tx_status[i].paused); - printk("\n"KERN_DEBUG " Rx ring %8.8llx:\n", + printk(KERN_DEBUG " Rx ring %8.8llx:\n", (unsigned long long)yp->rx_ring_dma); for (i = 0; i < RX_RING_SIZE; i++) { printk(KERN_DEBUG " %c #%d desc. 
%8.8x %8.8x %8.8x\n", diff --git a/drivers/parisc/eisa_enumerator.c b/drivers/parisc/eisa_enumerator.c index c709ecc2b7f7..0be1d50645ab 100644 --- a/drivers/parisc/eisa_enumerator.c +++ b/drivers/parisc/eisa_enumerator.c @@ -101,7 +101,7 @@ static int configure_memory(const unsigned char *buf, printk("memory %lx-%lx ", (unsigned long)res->start, (unsigned long)res->end); result = request_resource(mem_parent, res); if (result < 0) { - printk("\n" KERN_ERR "EISA Enumerator: failed to claim EISA Bus address space!\n"); + printk(KERN_ERR "EISA Enumerator: failed to claim EISA Bus address space!\n"); return result; } } @@ -191,7 +191,7 @@ static int configure_port(const unsigned char *buf, struct resource *io_parent, printk("ioports %lx-%lx ", (unsigned long)res->start, (unsigned long)res->end); result = request_resource(io_parent, res); if (result < 0) { - printk("\n" KERN_ERR "EISA Enumerator: failed to claim EISA Bus address space!\n"); + printk(KERN_ERR "EISA Enumerator: failed to claim EISA Bus address space!\n"); return result; } } @@ -224,7 +224,7 @@ static int configure_port_init(const unsigned char *buf) case HPEE_PORT_INIT_WIDTH_BYTE: s=1; if (c & HPEE_PORT_INIT_MASK) { - printk("\n" KERN_WARNING "port_init: unverified mask attribute\n"); + printk(KERN_WARNING "port_init: unverified mask attribute\n"); outb((inb(get_16(buf+len+1) & get_8(buf+len+3)) | get_8(buf+len+4)), get_16(buf+len+1)); @@ -249,7 +249,7 @@ static int configure_port_init(const unsigned char *buf) case HPEE_PORT_INIT_WIDTH_DWORD: s=4; if (c & HPEE_PORT_INIT_MASK) { - printk("\n" KERN_WARNING "port_init: unverified mask attribute\n"); + printk(KERN_WARNING "port_init: unverified mask attribute\n"); outl((inl(get_16(buf+len+1) & get_32(buf+len+3)) | get_32(buf+len+7)), get_16(buf+len+1)); @@ -259,7 +259,7 @@ static int configure_port_init(const unsigned char *buf) break; default: - printk("\n" KERN_ERR "Invalid port init word %02x\n", c); + printk(KERN_ERR "Invalid port init word %02x\n", c); return 0; } @@ -297,7 +297,7 @@ static int configure_type_string(const unsigned char *buf) /* just skip past the type field */ len = get_8(buf); if (len > 80) { - printk("\n" KERN_ERR "eisa_enumerator: type info field too long (%d, max is 80)\n", len); + printk(KERN_ERR "eisa_enumerator: type info field too long (%d, max is 80)\n", len); } return 1+len; @@ -398,7 +398,7 @@ static int parse_slot_config(int slot, } if (p0 + function_len < pos) { - printk("\n" KERN_ERR "eisa_enumerator: function %d length mis-match " + printk(KERN_ERR "eisa_enumerator: function %d length mis-match " "got %d, expected %d\n", num_func, pos-p0, function_len); res=-1; diff --git a/drivers/pcmcia/tcic.c b/drivers/pcmcia/tcic.c index 9ad97ea836e8..8eb04230fec7 100644 --- a/drivers/pcmcia/tcic.c +++ b/drivers/pcmcia/tcic.c @@ -472,7 +472,8 @@ static int __init init_tcic(void) init_timer(&poll_timer); /* Build interrupt mask */ - printk(", %d sockets\n" KERN_INFO " irq list (", sockets); + printk(KERN_CONT ", %d sockets\n", sockets); + printk(KERN_INFO " irq list ("); if (irq_list_count == 0) mask = irq_mask; else diff --git a/drivers/scsi/atari_NCR5380.c b/drivers/scsi/atari_NCR5380.c index 0471f8800483..4240b05aef6d 100644 --- a/drivers/scsi/atari_NCR5380.c +++ b/drivers/scsi/atari_NCR5380.c @@ -2826,8 +2826,7 @@ int NCR5380_abort(Scsi_Cmnd *cmd) */ local_irq_restore(flags); - printk(KERN_INFO "scsi%d: warning : SCSI command probably completed successfully\n" - KERN_INFO " before abortion\n", HOSTNO); + printk(KERN_INFO "scsi%d: warning : SCSI command 
probably completed successfully before abortion\n", HOSTNO); /* Maybe it is sufficient just to release the ST-DMA lock... (if * possible at all) At least, we should check if the lock could be diff --git a/drivers/scsi/mac53c94.c b/drivers/scsi/mac53c94.c index b12ad7c7c673..18735b39b3d3 100644 --- a/drivers/scsi/mac53c94.c +++ b/drivers/scsi/mac53c94.c @@ -75,8 +75,9 @@ static int mac53c94_queue(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd * int i; printk(KERN_DEBUG "mac53c94_queue %p: command is", cmd); for (i = 0; i < cmd->cmd_len; ++i) - printk(" %.2x", cmd->cmnd[i]); - printk("\n" KERN_DEBUG "use_sg=%d request_bufflen=%d request_buffer=%p\n", + printk(KERN_CONT " %.2x", cmd->cmnd[i]); + printk(KERN_CONT "\n"); + printk(KERN_DEBUG "use_sg=%d request_bufflen=%d request_buffer=%p\n", scsi_sg_count(cmd), scsi_bufflen(cmd), scsi_sglist(cmd)); } #endif diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index ef142fd47a83..4d6f2fe1cfe9 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -619,7 +619,7 @@ sg_write(struct file *filp, const char __user *buf, size_t count, loff_t * ppos) if (strcmp(current->comm, cmd) && printk_ratelimit()) { printk(KERN_WARNING "sg_write: data in/out %d/%d bytes for SCSI command 0x%x--" - "guessing data in;\n" KERN_WARNING " " + "guessing data in;\n " "program %s not setting count and/or reply_len properly\n", old_hdr.reply_len - (int)SZ_SG_HEADER, input_size, (unsigned int) cmnd[0], diff --git a/drivers/scsi/sun3_NCR5380.c b/drivers/scsi/sun3_NCR5380.c index bcaba86060ab..75da6e58ce55 100644 --- a/drivers/scsi/sun3_NCR5380.c +++ b/drivers/scsi/sun3_NCR5380.c @@ -2860,8 +2860,7 @@ static int NCR5380_abort(struct scsi_cmnd *cmd) */ local_irq_restore(flags); - printk(KERN_INFO "scsi%d: warning : SCSI command probably completed successfully\n" - KERN_INFO " before abortion\n", HOSTNO); + printk(KERN_INFO "scsi%d: warning : SCSI command probably completed successfully before abortion\n", HOSTNO); return SCSI_ABORT_NOT_RUNNING; } diff --git a/drivers/serial/8250_pci.c b/drivers/serial/8250_pci.c index 6160e03f410c..e7108e75653d 100644 --- a/drivers/serial/8250_pci.c +++ b/drivers/serial/8250_pci.c @@ -60,11 +60,12 @@ struct serial_private { static void moan_device(const char *str, struct pci_dev *dev) { - printk(KERN_WARNING "%s: %s\n" - KERN_WARNING "Please send the output of lspci -vv, this\n" - KERN_WARNING "message (0x%04x,0x%04x,0x%04x,0x%04x), the\n" - KERN_WARNING "manufacturer and name of serial board or\n" - KERN_WARNING "modem board to rmk+serial@arm.linux.org.uk.\n", + printk(KERN_WARNING + "%s: %s\n" + "Please send the output of lspci -vv, this\n" + "message (0x%04x,0x%04x,0x%04x,0x%04x), the\n" + "manufacturer and name of serial board or\n" + "modem board to rmk+serial@arm.linux.org.uk.\n", pci_name(dev), str, dev->vendor, dev->device, dev->subsystem_vendor, dev->subsystem_device); } diff --git a/drivers/ssb/pcmcia.c b/drivers/ssb/pcmcia.c index fbfadbac67e8..131030f693c7 100644 --- a/drivers/ssb/pcmcia.c +++ b/drivers/ssb/pcmcia.c @@ -583,7 +583,7 @@ static int ssb_pcmcia_sprom_write_all(struct ssb_bus *bus, const u16 *sprom) ssb_printk("."); err = ssb_pcmcia_sprom_write(bus, i, sprom[i]); if (err) { - ssb_printk("\n" KERN_NOTICE PFX + ssb_printk(KERN_NOTICE PFX "Failed to write to SPROM.\n"); failed = 1; break; @@ -591,7 +591,7 @@ static int ssb_pcmcia_sprom_write_all(struct ssb_bus *bus, const u16 *sprom) } err = ssb_pcmcia_sprom_command(bus, SSB_PCMCIA_SPROMCTL_WRITEDIS); if (err) { - ssb_printk("\n" KERN_NOTICE PFX + 
ssb_printk(KERN_NOTICE PFX "Could not disable SPROM write access.\n"); failed = 1; } diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index ce3f453f02ef..95ccfa0b9fc5 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -648,7 +648,7 @@ void usb_hcd_poll_rh_status(struct usb_hcd *hcd) struct urb *urb; int length; unsigned long flags; - char buffer[4]; /* Any root hubs with > 31 ports? */ + char buffer[6]; /* Any root hubs with > 31 ports? */ if (unlikely(!hcd->rh_registered)) return; diff --git a/drivers/video/amba-clcd.c b/drivers/video/amba-clcd.c index fb8163d181ab..a21efcd10b78 100644 --- a/drivers/video/amba-clcd.c +++ b/drivers/video/amba-clcd.c @@ -226,9 +226,10 @@ static int clcdfb_set_par(struct fb_info *info) clcdfb_enable(fb, regs.cntl); #ifdef DEBUG - printk(KERN_INFO "CLCD: Registers set to\n" - KERN_INFO " %08x %08x %08x %08x\n" - KERN_INFO " %08x %08x %08x %08x\n", + printk(KERN_INFO + "CLCD: Registers set to\n" + " %08x %08x %08x %08x\n" + " %08x %08x %08x %08x\n", readl(fb->regs + CLCD_TIM0), readl(fb->regs + CLCD_TIM1), readl(fb->regs + CLCD_TIM2), readl(fb->regs + CLCD_TIM3), readl(fb->regs + CLCD_UBAS), readl(fb->regs + CLCD_LBAS), diff --git a/drivers/video/matrox/matroxfb_DAC1064.c b/drivers/video/matrox/matroxfb_DAC1064.c index 0ce3b0a89798..a74e5da17aa0 100644 --- a/drivers/video/matrox/matroxfb_DAC1064.c +++ b/drivers/video/matrox/matroxfb_DAC1064.c @@ -454,9 +454,9 @@ static void DAC1064_restore_2(WPMINFO2) { dprintk(KERN_DEBUG "DAC1064regs "); for (i = 0; i < sizeof(MGA1064_DAC_regs); i++) { dprintk("R%02X=%02X ", MGA1064_DAC_regs[i], ACCESS_FBINFO(hw).DACreg[i]); - if ((i & 0x7) == 0x7) dprintk("\n" KERN_DEBUG "continuing... "); + if ((i & 0x7) == 0x7) dprintk(KERN_DEBUG "continuing... "); } - dprintk("\n" KERN_DEBUG "DAC1064clk "); + dprintk(KERN_DEBUG "DAC1064clk "); for (i = 0; i < 6; i++) dprintk("C%02X=%02X ", i, ACCESS_FBINFO(hw).DACclk[i]); dprintk("\n"); diff --git a/drivers/video/matrox/matroxfb_Ti3026.c b/drivers/video/matrox/matroxfb_Ti3026.c index 13524821e242..4e825112a601 100644 --- a/drivers/video/matrox/matroxfb_Ti3026.c +++ b/drivers/video/matrox/matroxfb_Ti3026.c @@ -651,9 +651,9 @@ static void Ti3026_restore(WPMINFO2) { dprintk(KERN_DEBUG "3026DACregs "); for (i = 0; i < 21; i++) { dprintk("R%02X=%02X ", DACseq[i], hw->DACreg[i]); - if ((i & 0x7) == 0x7) dprintk("\n" KERN_DEBUG "continuing... "); + if ((i & 0x7) == 0x7) dprintk(KERN_DEBUG "continuing... "); } - dprintk("\n" KERN_DEBUG "DACclk "); + dprintk(KERN_DEBUG "DACclk "); for (i = 0; i < 6; i++) dprintk("C%02X=%02X ", i, hw->DACclk[i]); dprintk("\n"); diff --git a/drivers/video/stifb.c b/drivers/video/stifb.c index eec9dcb7f599..6120f0c526fe 100644 --- a/drivers/video/stifb.c +++ b/drivers/video/stifb.c @@ -1115,10 +1115,9 @@ static int __init stifb_init_fb(struct sti_struct *sti, int bpp_pref) if the device name contains the string "DX" and tell the user how to reconfigure the card. 
*/ if (strstr(sti->outptr.dev_name, "DX")) { - printk(KERN_WARNING "WARNING: stifb framebuffer driver does not " - "support '%s' in double-buffer mode.\n" - KERN_WARNING "WARNING: Please disable the double-buffer mode " - "in IPL menu (the PARISC-BIOS).\n", + printk(KERN_WARNING +"WARNING: stifb framebuffer driver does not support '%s' in double-buffer mode.\n" +"WARNING: Please disable the double-buffer mode in IPL menu (the PARISC-BIOS).\n", sti->outptr.dev_name); goto out_err0; } diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index a0244740b75a..b47679be118a 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -270,19 +270,21 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c, D2({ int i=0; struct jffs2_raw_node_ref *this; - printk(KERN_DEBUG "After remove_node_refs_from_ino_list: \n" KERN_DEBUG); + printk(KERN_DEBUG "After remove_node_refs_from_ino_list: \n"); this = ic->nodes; + printk(KERN_DEBUG); while(this) { - printk( "0x%08x(%d)->", ref_offset(this), ref_flags(this)); + printk(KERN_CONT "0x%08x(%d)->", + ref_offset(this), ref_flags(this)); if (++i == 5) { - printk("\n" KERN_DEBUG); + printk(KERN_DEBUG); i=0; } this = this->next_in_ino; } - printk("\n"); + printk(KERN_CONT "\n"); }); switch (ic->class) { diff --git a/kernel/module.c b/kernel/module.c index 38928fcaff2b..0a049837008e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2451,9 +2451,9 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, return ret; } if (ret > 0) { - printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, " - "it should follow 0/-E convention\n" - KERN_WARNING "%s: loading module anyway...\n", + printk(KERN_WARNING +"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" +"%s: loading module anyway...\n", __func__, mod->name, ret, __func__); dump_stack(); diff --git a/sound/pci/emu10k1/p16v.c b/sound/pci/emu10k1/p16v.c index e617acaf10e3..61b8ab39800f 100644 --- a/sound/pci/emu10k1/p16v.c +++ b/sound/pci/emu10k1/p16v.c @@ -644,7 +644,7 @@ int __devinit snd_p16v_pcm(struct snd_emu10k1 *emu, int device, struct snd_pcm * int err; int capture=1; - /* snd_printk("KERN_DEBUG snd_p16v_pcm called. device=%d\n", device); */ + /* snd_printk(KERN_DEBUG "snd_p16v_pcm called. device=%d\n", device); */ emu->p16v_device_offset = device; if (rpcm) *rpcm = NULL; diff --git a/sound/usb/usx2y/usbusx2yaudio.c b/sound/usb/usx2y/usbusx2yaudio.c index dd1ab6177840..9efd27f6b52f 100644 --- a/sound/usb/usx2y/usbusx2yaudio.c +++ b/sound/usb/usx2y/usbusx2yaudio.c @@ -296,9 +296,10 @@ static void usX2Y_error_urb_status(struct usX2Ydev *usX2Y, static void usX2Y_error_sequence(struct usX2Ydev *usX2Y, struct snd_usX2Y_substream *subs, struct urb *urb) { - snd_printk(KERN_ERR "Sequence Error!(hcd_frame=%i ep=%i%s;wait=%i,frame=%i).\n" - KERN_ERR "Most propably some urb of usb-frame %i is still missing.\n" - KERN_ERR "Cause could be too long delays in usb-hcd interrupt handling.\n", + snd_printk(KERN_ERR +"Sequence Error!(hcd_frame=%i ep=%i%s;wait=%i,frame=%i).\n" +"Most propably some urb of usb-frame %i is still missing.\n" +"Cause could be too long delays in usb-hcd interrupt handling.\n", usb_get_current_frame_number(usX2Y->chip.dev), subs->endpoint, usb_pipein(urb->pipe) ? 
"in" : "out", usX2Y->wait_iso_frame, urb->start_frame, usX2Y->wait_iso_frame); -- cgit v1.2.3 From 264ef8a904943ed7d0b04fa958894d7a5c2b2c61 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 7 Jul 2009 10:33:01 +0100 Subject: kmemleak: Remove alloc_bootmem annotations introduced in the past kmemleak_alloc() calls were added in some places where alloc_bootmem was called. Since now kmemleak tracks bootmem allocations, these explicit calls should be run. Signed-off-by: Catalin Marinas Cc: Ingo Molnar Acked-by: Pekka Enberg --- kernel/pid.c | 7 ------- mm/page_alloc.c | 14 +++----------- 2 files changed, 3 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 5fa1db48d8b7..31310b5d3f50 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -36,7 +36,6 @@ #include #include #include -#include #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) @@ -513,12 +512,6 @@ void __init pidhash_init(void) pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash))); if (!pid_hash) panic("Could not alloc pidhash!\n"); - /* - * pid_hash contains references to allocated struct pid objects and it - * must be scanned by kmemleak to avoid false positives. - */ - kmemleak_alloc(pid_hash, pidhash_size * sizeof(*(pid_hash)), 0, - GFP_KERNEL); for (i = 0; i < pidhash_size; i++) INIT_HLIST_HEAD(&pid_hash[i]); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ad7cd1c56b07..3ef628845f07 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4745,8 +4745,10 @@ void *__init alloc_large_system_hash(const char *tablename, * some pages at the end of hash table which * alloc_pages_exact() automatically does */ - if (get_order(size) < MAX_ORDER) + if (get_order(size) < MAX_ORDER) { table = alloc_pages_exact(size, GFP_ATOMIC); + kmemleak_alloc(table, size, 1, GFP_ATOMIC); + } } } while (!table && size > PAGE_SIZE && --log2qty); @@ -4764,16 +4766,6 @@ void *__init alloc_large_system_hash(const char *tablename, if (_hash_mask) *_hash_mask = (1 << log2qty) - 1; - /* - * If hashdist is set, the table allocation is done with __vmalloc() - * which invokes the kmemleak_alloc() callback. This function may also - * be called before the slab and kmemleak are initialised when - * kmemleak simply buffers the request to be executed later - * (GFP_ATOMIC flag ignored in this case). - */ - if (!hashdist) - kmemleak_alloc(table, size, 1, GFP_ATOMIC); - return table; } -- cgit v1.2.3 From 71a851b4d2a815adcfac09c1adda7ef6811fde66 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 10 Jul 2009 09:06:56 +0200 Subject: perf_counter: Stop open coding unclone_ctx Instead of open coding the unclone context thingy, put it in a common function. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d55a50da2347..8bf997d86bf4 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -146,6 +146,14 @@ static void put_ctx(struct perf_counter_context *ctx) } } +static void unclone_ctx(struct perf_counter_context *ctx) +{ + if (ctx->parent_ctx) { + put_ctx(ctx->parent_ctx); + ctx->parent_ctx = NULL; + } +} + /* * Get the perf_counter_context for a task and lock it. 
* This has to cope with with the fact that until it is locked, @@ -1463,10 +1471,8 @@ static void perf_counter_enable_on_exec(struct task_struct *task) /* * Unclone this context if we enabled any counter. */ - if (enabled && ctx->parent_ctx) { - put_ctx(ctx->parent_ctx); - ctx->parent_ctx = NULL; - } + if (enabled) + unclone_ctx(ctx); spin_unlock(&ctx->lock); @@ -1526,7 +1532,6 @@ __perf_counter_init_context(struct perf_counter_context *ctx, static struct perf_counter_context *find_get_context(pid_t pid, int cpu) { - struct perf_counter_context *parent_ctx; struct perf_counter_context *ctx; struct perf_cpu_context *cpuctx; struct task_struct *task; @@ -1586,11 +1591,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) retry: ctx = perf_lock_task_context(task, &flags); if (ctx) { - parent_ctx = ctx->parent_ctx; - if (parent_ctx) { - put_ctx(parent_ctx); - ctx->parent_ctx = NULL; /* no longer a clone */ - } + unclone_ctx(ctx); spin_unlock_irqrestore(&ctx->lock, flags); } @@ -4255,15 +4256,12 @@ void perf_counter_exit_task(struct task_struct *child) */ spin_lock(&child_ctx->lock); child->perf_counter_ctxp = NULL; - if (child_ctx->parent_ctx) { - /* - * This context is a clone; unclone it so it can't get - * swapped to another process while we're removing all - * the counters from it. - */ - put_ctx(child_ctx->parent_ctx); - child_ctx->parent_ctx = NULL; - } + /* + * If this context is a clone; unclone it so it can't get + * swapped to another process while we're removing all + * the counters from it. + */ + unclone_ctx(child_ctx); spin_unlock(&child_ctx->lock); local_irq_restore(flags); -- cgit v1.2.3 From a1ba4d8ba9f06a397e97cbd67a93ee306860b40a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 1 Apr 2009 18:40:15 +0200 Subject: sched_rt: Fix overload bug on rt group scheduling Fixes an easily triggerable BUG() when setting process affinities. Make sure to count the number of migratable tasks in the same place: the root rt_rq. Otherwise the number doesn't make sense and we'll hit the BUG in set_cpus_allowed_rt(). 
Also, make sure we only count tasks, not groups (this is probably already taken care of by the fact that rt_se->nr_cpus_allowed will be 0 for groups, but be more explicit) Tested-by: Thomas Gleixner CC: stable@kernel.org Signed-off-by: Peter Zijlstra Acked-by: Gregory Haskins LKML-Reference: <1247067476.9777.57.camel@twins> Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + kernel/sched_rt.c | 18 +++++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7c9098d186e6..a17f3d9a8bfa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -493,6 +493,7 @@ struct rt_rq { #endif #ifdef CONFIG_SMP unsigned long rt_nr_migratory; + unsigned long rt_nr_total; int overloaded; struct plist_head pushable_tasks; #endif diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 9bf0d2a73045..3918e01994e0 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -10,6 +10,8 @@ static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) #ifdef CONFIG_RT_GROUP_SCHED +#define rt_entity_is_task(rt_se) (!(rt_se)->my_q) + static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) { return rt_rq->rq; @@ -22,6 +24,8 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) #else /* CONFIG_RT_GROUP_SCHED */ +#define rt_entity_is_task(rt_se) (1) + static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) { return container_of(rt_rq, struct rq, rt); @@ -73,7 +77,7 @@ static inline void rt_clear_overload(struct rq *rq) static void update_rt_migration(struct rt_rq *rt_rq) { - if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) { + if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) { if (!rt_rq->overloaded) { rt_set_overload(rq_of_rt_rq(rt_rq)); rt_rq->overloaded = 1; @@ -86,6 +90,12 @@ static void update_rt_migration(struct rt_rq *rt_rq) static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { + if (!rt_entity_is_task(rt_se)) + return; + + rt_rq = &rq_of_rt_rq(rt_rq)->rt; + + rt_rq->rt_nr_total++; if (rt_se->nr_cpus_allowed > 1) rt_rq->rt_nr_migratory++; @@ -94,6 +104,12 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { + if (!rt_entity_is_task(rt_se)) + return; + + rt_rq = &rq_of_rt_rq(rt_rq)->rt; + + rt_rq->rt_nr_total--; if (rt_se->nr_cpus_allowed > 1) rt_rq->rt_nr_migratory--; -- cgit v1.2.3 From 7793527b90d9418211f4fe8464cc1dcb1631ea1b Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Thu, 9 Jul 2009 13:57:20 +0200 Subject: sched: Reset sched stats on fork() The sched_stat fields are currently not reset upon fork. Ingo's recent commit 6c594c21fcb02c662f11c97be4d7d2b73060a205 did reset nr_migrations, but it didn't reset any of the others. This patch resets all sched_stat fields on fork. 
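The effect can be seen in a stand-alone C toy model (illustrative only, not part of the patch; the actual change open-codes the per-field assignments in __sched_fork(), as the diff below shows):

#include <stdio.h>
#include <string.h>

/* Toy stand-in for a few of the schedstat fields touched by __sched_fork(). */
struct sched_stats {
        unsigned long wait_max;
        unsigned long sleep_max;
        unsigned long nr_wakeups;
};

struct task {
        struct sched_stats stats;
};

/*
 * fork() duplicates the parent's task struct, so unless the statistics
 * are zeroed explicitly the child starts life with the parent's history.
 */
static void toy_sched_fork(struct task *child, const struct task *parent)
{
        *child = *parent;                               /* dup_task_struct() copy */
        memset(&child->stats, 0, sizeof(child->stats)); /* the reset added here   */
}

int main(void)
{
        struct task parent = { .stats = { 42, 7, 1000 } };
        struct task child;

        toy_sched_fork(&child, &parent);
        printf("child nr_wakeups after fork: %lu\n", child.stats.nr_wakeups);
        return 0;
}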
Signed-off-by: Lucas De Marchi Signed-off-by: Peter Zijlstra LKML-Reference: <193b0f820907090457s7a3662f4gcdecdc22fcae857b@mail.gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a17f3d9a8bfa..c4549bd7e174 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2572,15 +2572,37 @@ static void __sched_fork(struct task_struct *p) p->se.avg_wakeup = sysctl_sched_wakeup_granularity; #ifdef CONFIG_SCHEDSTATS - p->se.wait_start = 0; - p->se.sum_sleep_runtime = 0; - p->se.sleep_start = 0; - p->se.block_start = 0; - p->se.sleep_max = 0; - p->se.block_max = 0; - p->se.exec_max = 0; - p->se.slice_max = 0; - p->se.wait_max = 0; + p->se.wait_start = 0; + p->se.wait_max = 0; + p->se.wait_count = 0; + p->se.wait_sum = 0; + + p->se.sleep_start = 0; + p->se.sleep_max = 0; + p->se.sum_sleep_runtime = 0; + + p->se.block_start = 0; + p->se.block_max = 0; + p->se.exec_max = 0; + p->se.slice_max = 0; + + p->se.nr_migrations_cold = 0; + p->se.nr_failed_migrations_affine = 0; + p->se.nr_failed_migrations_running = 0; + p->se.nr_failed_migrations_hot = 0; + p->se.nr_forced_migrations = 0; + p->se.nr_forced2_migrations = 0; + + p->se.nr_wakeups = 0; + p->se.nr_wakeups_sync = 0; + p->se.nr_wakeups_migrate = 0; + p->se.nr_wakeups_local = 0; + p->se.nr_wakeups_remote = 0; + p->se.nr_wakeups_affine = 0; + p->se.nr_wakeups_affine_attempts = 0; + p->se.nr_wakeups_passive = 0; + p->se.nr_wakeups_idle = 0; + #endif INIT_LIST_HEAD(&p->rt.run_list); -- cgit v1.2.3 From c20b08e3986c2dbfa6df1e880bf4f7159994d199 Mon Sep 17 00:00:00 2001 From: Fabio Checconi Date: Mon, 15 Jun 2009 20:56:38 +0200 Subject: sched: Fix rt_rq->pushable_tasks initialization in init_rt_rq() init_rt_rq() initializes only rq->rt.pushable_tasks, and not the pushable_tasks field of the passed rt_rq. The plist is not used uninitialized since the only pushable_tasks plists used are the ones of root rt_rqs; anyway reinitializing the list on every group creation corrupts the root plist, losing its previous contents. Signed-off-by: Fabio Checconi Signed-off-by: Peter Zijlstra LKML-Reference: <20090615185638.GK21741@gandalf.sssup.it> CC: Gregory Haskins Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c4549bd7e174..efecfdad1b5f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9093,7 +9093,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) #ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; - plist_head_init(&rq->rt.pushable_tasks, &rq->lock); + plist_head_init(&rt_rq->pushable_tasks, &rq->lock); #endif rt_rq->rt_time = 0; -- cgit v1.2.3 From 7e0c5086c172ecf8b0c2ad860b02a586967d17d0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 9 Jul 2009 13:52:32 +0200 Subject: hrtimer: migration: do not check expiry time on current CPU The timer migration code needs to check whether the expiry time of the timer is before the programmed clock event expiry time when the timer is enqueued on another CPU because we can not reprogram the timer device on the other CPU. The current logic checks the expiry time even if we enqueue on the current CPU when nohz_get_load_balancer() returns current CPU. This might lead to an endless loop in the expiry check code when the expiry time of the timer is before the current programmed next event. 
Check whether nohz_get_load_balancer() returns current CPU and skip the expiry check if this is the case. The bug was triggered from the networking code. The patch fixes the regression http://bugzilla.kernel.org/show_bug.cgi?id=13738 (Soft-Lockup/Race in networking in 2.6.31-rc1+195) Cc: Arun Bharadwaj Tested-by: Andres Freund Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 9002958a96e7..126b9808f287 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -206,8 +206,19 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { preferred_cpu = get_nohz_load_balancer(); - if (preferred_cpu >= 0) - cpu = preferred_cpu; + if (preferred_cpu >= 0) { + /* + * We must not check the expiry value when + * preferred_cpu is the current cpu. If base + * != new_base we would loop forever when the + * timer expires before the current programmed + * next timer event. + */ + if (preferred_cpu != cpu) + cpu = preferred_cpu; + else + preferred_cpu = -1; + } } #endif -- cgit v1.2.3 From 6ff7041dbfeb3bd7dfe9aa67275c21199ef760d6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Jul 2009 14:57:05 +0200 Subject: hrtimer: Fix migration expiry check The timer migration expiry check should prevent the migration of a timer to another CPU when the timer expires before the next event is scheduled on the other CPU. Migrating the timer might delay it because we can not reprogram the clock event device on the other CPU. But the code implementing that check has two flaws: - for !HIGHRES the check compares the expiry value with the clock events device expiry value which is wrong for CLOCK_REALTIME based timers. - the check is racy. It holds the hrtimer base lock of the target CPU, but the clock event device expiry value can be modified nevertheless, e.g. by an timer interrupt firing. The !HIGHRES case is easy to fix as we can enqueue the timer on the cpu which was selected by the load balancer. It runs the idle balancing code once per jiffy anyway. So the maximum delay for the timer is the same as when we keep the tick on the current cpu going. In the HIGHRES case we can get the next expiry value from the hrtimer cpu_base of the target CPU and serialize the update with the cpu_base lock. This moves the lock section in hrtimer_interrupt() so we can set next_event to KTIME_MAX while we are handling the expired timers and set it to the next expiry value after we handled the timers under the base lock. While the expired timers are processed timer migration is blocked because the expiry time of the timer is always <= KTIME_MAX. Also remove the now useless clockevents_get_next_event() function. 
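For illustration, the HIGHRES rule boils down to a single comparison. The stand-alone C sketch below (with made-up expiry values, not part of the patch) shows the normal decision and how publishing KTIME_MAX from hrtimer_interrupt() blocks migration for the duration of the handler:

#include <stdint.h>
#include <stdio.h>

#define KTIME_MAX INT64_MAX

/*
 * A timer may be enqueued on another CPU only if it expires after that
 * CPU's next programmed event, because the remote clock event device
 * cannot be reprogrammed from here. While the target CPU is inside
 * hrtimer_interrupt() it publishes KTIME_MAX as its next event, so every
 * expiry compares as "too early" and migration is blocked for that window.
 */
static int may_migrate(int64_t expires, int64_t target_expires_next)
{
        return expires > target_expires_next;
}

int main(void)
{
        int64_t next = 1000;    /* target CPU's programmed next event */

        printf("expires=900  -> migrate? %d\n", may_migrate(900, next));
        printf("expires=2000 -> migrate? %d\n", may_migrate(2000, next));
        printf("target in interrupt handler -> migrate? %d\n",
               may_migrate(2000, KTIME_MAX));
        return 0;
}

This prints 0, 1, 0, matching hrtimer_check_target()'s "expires <= expires_next means stay local" test.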
Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 9 ---- kernel/hrtimer.c | 121 ++++++++++++++++++++++++--------------------- kernel/time/clockevents.c | 11 ----- 3 files changed, 64 insertions(+), 77 deletions(-) (limited to 'kernel') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 20a100fe2b4f..3a1dbba4d3ae 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -143,12 +143,3 @@ extern void clockevents_notify(unsigned long reason, void *arg); #endif #endif - -#ifdef CONFIG_GENERIC_CLOCKEVENTS -extern ktime_t clockevents_get_next_event(int cpu); -#else -static inline ktime_t clockevents_get_next_event(int cpu) -{ - return (ktime_t) { .tv64 = KTIME_MAX }; -} -#endif diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 126b9808f287..49da79ab8486 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -191,6 +191,46 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, } } + +/* + * Get the preferred target CPU for NOHZ + */ +static int hrtimer_get_target(int this_cpu, int pinned) +{ +#ifdef CONFIG_NO_HZ + if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) { + int preferred_cpu = get_nohz_load_balancer(); + + if (preferred_cpu >= 0) + return preferred_cpu; + } +#endif + return this_cpu; +} + +/* + * With HIGHRES=y we do not migrate the timer when it is expiring + * before the next event on the target cpu because we cannot reprogram + * the target cpu hardware and we would cause it to fire late. + * + * Called with cpu_base->lock of target cpu held. + */ +static int +hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) +{ +#ifdef CONFIG_HIGH_RES_TIMERS + ktime_t expires; + + if (!new_base->cpu_base->hres_active) + return 0; + + expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); + return expires.tv64 <= new_base->cpu_base->expires_next.tv64; +#else + return 0; +#endif +} + /* * Switch the timer base to the current CPU when possible. */ @@ -200,27 +240,8 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, { struct hrtimer_clock_base *new_base; struct hrtimer_cpu_base *new_cpu_base; - int cpu, preferred_cpu = -1; - - cpu = smp_processor_id(); -#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) - if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { - preferred_cpu = get_nohz_load_balancer(); - if (preferred_cpu >= 0) { - /* - * We must not check the expiry value when - * preferred_cpu is the current cpu. If base - * != new_base we would loop forever when the - * timer expires before the current programmed - * next timer event. - */ - if (preferred_cpu != cpu) - cpu = preferred_cpu; - else - preferred_cpu = -1; - } - } -#endif + int this_cpu = smp_processor_id(); + int cpu = hrtimer_get_target(this_cpu, pinned); again: new_cpu_base = &per_cpu(hrtimer_bases, cpu); @@ -228,7 +249,7 @@ again: if (base != new_base) { /* - * We are trying to schedule the timer on the local CPU. + * We are trying to move timer to new_base. * However we can't change timer's base while it is running, * so we keep it on the same CPU. No hassle vs. reprogramming * the event source in the high resolution case. 
The softirq @@ -244,38 +265,12 @@ again: spin_unlock(&base->cpu_base->lock); spin_lock(&new_base->cpu_base->lock); - /* Optimized away for NOHZ=n SMP=n */ - if (cpu == preferred_cpu) { - /* Calculate clock monotonic expiry time */ -#ifdef CONFIG_HIGH_RES_TIMERS - ktime_t expires = ktime_sub(hrtimer_get_expires(timer), - new_base->offset); -#else - ktime_t expires = hrtimer_get_expires(timer); -#endif - - /* - * Get the next event on target cpu from the - * clock events layer. - * This covers the highres=off nohz=on case as well. - */ - ktime_t next = clockevents_get_next_event(cpu); - - ktime_t delta = ktime_sub(expires, next); - - /* - * We do not migrate the timer when it is expiring - * before the next event on the target cpu because - * we cannot reprogram the target cpu hardware and - * we would cause it to fire late. - */ - if (delta.tv64 < 0) { - cpu = smp_processor_id(); - spin_unlock(&new_base->cpu_base->lock); - spin_lock(&base->cpu_base->lock); - timer->base = base; - goto again; - } + if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { + cpu = this_cpu; + spin_unlock(&new_base->cpu_base->lock); + spin_lock(&base->cpu_base->lock); + timer->base = base; + goto again; } timer->base = new_base; } @@ -1287,14 +1282,22 @@ void hrtimer_interrupt(struct clock_event_device *dev) expires_next.tv64 = KTIME_MAX; + spin_lock(&cpu_base->lock); + /* + * We set expires_next to KTIME_MAX here with cpu_base->lock + * held to prevent that a timer is enqueued in our queue via + * the migration code. This does not affect enqueueing of + * timers which run their callback and need to be requeued on + * this CPU. + */ + cpu_base->expires_next.tv64 = KTIME_MAX; + base = cpu_base->clock_base; for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { ktime_t basenow; struct rb_node *node; - spin_lock(&cpu_base->lock); - basenow = ktime_add(now, base->offset); while ((node = base->first)) { @@ -1327,11 +1330,15 @@ void hrtimer_interrupt(struct clock_event_device *dev) __run_hrtimer(timer); } - spin_unlock(&cpu_base->lock); base++; } + /* + * Store the new expiry value so the migration code can verify + * against it. + */ cpu_base->expires_next = expires_next; + spin_unlock(&cpu_base->lock); /* Reprogramming necessary ? */ if (expires_next.tv64 != KTIME_MAX) { diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 1ad6dd461119..a6dcd67b041d 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -254,15 +254,4 @@ void clockevents_notify(unsigned long reason, void *arg) spin_unlock(&clockevents_lock); } EXPORT_SYMBOL_GPL(clockevents_notify); - -ktime_t clockevents_get_next_event(int cpu) -{ - struct tick_device *td; - struct clock_event_device *dev; - - td = &per_cpu(tick_cpu_device, cpu); - dev = td->evtdev; - - return dev->next_event; -} #endif -- cgit v1.2.3 From d86ee4809d0329d4aa0d0f2c76c2295a16862799 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 10 Jul 2009 14:57:57 +0200 Subject: sched: optimize cond_resched() Optimize cond_resched() by removing one conditional. Currently cond_resched() checks system_state == SYSTEM_RUNNING in order to avoid scheduling before the scheduler is running. We can however, as per suggestion of Matt, use PREEMPT_ACTIVE to accomplish that very same. 
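For illustration only (not part of the patch, and the PREEMPT_ACTIVE value here is made up): folding PREEMPT_ACTIVE into the initial preempt count makes should_resched() evaluate false until the scheduler clears the count, so no separate system_state test is needed:

#include <stdio.h>

#define PREEMPT_ACTIVE  0x10000000      /* illustrative value */

/*
 * Boot starts with PREEMPT_ACTIVE folded into the preempt count, so
 * should_resched() stays false until init_idle() establishes the real
 * count; the system_state check becomes unnecessary.
 */
static unsigned int preempt_count = 1 + PREEMPT_ACTIVE; /* INIT_PREEMPT_COUNT */
static int need_resched_flag = 1;

static int should_resched(void)
{
        return need_resched_flag && !(preempt_count & PREEMPT_ACTIVE);
}

int main(void)
{
        printf("before sched_init(): should_resched() = %d\n", should_resched());
        preempt_count = 0;      /* init_idle() resets the count */
        printf("after sched_init():  should_resched() = %d\n", should_resched());
        return 0;
}

This is the reason INIT_PREEMPT_COUNT in the hunk below becomes (1 + PREEMPT_ACTIVE).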
Suggested-by: Matt Mackall Signed-off-by: Peter Zijlstra Acked-by: Matt Mackall Signed-off-by: Linus Torvalds --- include/linux/sched.h | 5 ++++- kernel/sched.c | 14 +++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2a99f1c15cf8..16a982e389fb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -501,8 +501,11 @@ struct task_cputime { /* * Disable preemption until the scheduler is running. * Reset by start_kernel()->sched_init()->init_idle(). + * + * We include PREEMPT_ACTIVE to avoid cond_resched() from working + * before the scheduler is active -- see should_resched(). */ -#define INIT_PREEMPT_COUNT (1) +#define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE) /** * struct thread_group_cputimer - thread group interval timer counts diff --git a/kernel/sched.c b/kernel/sched.c index 7c9098d186e6..01f55ada3598 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6541,6 +6541,11 @@ SYSCALL_DEFINE0(sched_yield) return 0; } +static inline int should_resched(void) +{ + return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); +} + static void __cond_resched(void) { #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP @@ -6560,8 +6565,7 @@ static void __cond_resched(void) int __sched _cond_resched(void) { - if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && - system_state == SYSTEM_RUNNING) { + if (should_resched()) { __cond_resched(); return 1; } @@ -6579,12 +6583,12 @@ EXPORT_SYMBOL(_cond_resched); */ int cond_resched_lock(spinlock_t *lock) { - int resched = need_resched() && system_state == SYSTEM_RUNNING; + int resched = should_resched(); int ret = 0; if (spin_needbreak(lock) || resched) { spin_unlock(lock); - if (resched && need_resched()) + if (resched) __cond_resched(); else cpu_relax(); @@ -6599,7 +6603,7 @@ int __sched cond_resched_softirq(void) { BUG_ON(!in_softirq()); - if (need_resched() && system_state == SYSTEM_RUNNING) { + if (should_resched()) { local_bh_enable(); __cond_resched(); local_bh_disable(); -- cgit v1.2.3 From d07387b490b1c43bfcb9f3900faf96f2dafb2630 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Fri, 10 Jul 2009 17:05:16 -0700 Subject: sched: Fix bug in SCHED_IDLE interaction with group scheduling One of the isolation modifications for SCHED_IDLE is the unitization of sleeper credit. However the check for this assumes that the sched_entity we're placing always belongs to a task. This is potentially not true with group scheduling and leaves us rummaging randomly when we try to pull the policy. Signed-off-by: Paul Turner Cc: peterz@infradead.org LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ba7fd6e9556f..7c248dc30f41 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -687,7 +687,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) * all of which have the same weight. 
*/ if (sched_feat(NORMALIZED_SLEEPER) && - task_of(se)->policy != SCHED_IDLE) + (!entity_is_task(se) || + task_of(se)->policy != SCHED_IDLE)) thresh = calc_delta_fair(thresh, se); vruntime -= thresh; -- cgit v1.2.3 From ce2ae53b750abfaa012ce408e93da131a5b5649b Mon Sep 17 00:00:00 2001 From: Sonny Rao Date: Fri, 10 Jul 2009 18:13:13 -0500 Subject: futexes: Fix infinite loop in get_futex_key() on huge page get_futex_key() can infinitely loop if it is called on a virtual address that is within a huge page but not aligned to the beginning of that page. The call to get_user_pages_fast will return the struct page for a sub-page within the huge page and the check for page->mapping will always fail. The fix is to call compound_head on the page before checking that it's mapped. Signed-off-by: Sonny Rao Acked-by: Thomas Gleixner Cc: stable@kernel.org Cc: anton@samba.org Cc: rajamony@us.ibm.com Cc: speight@us.ibm.com Cc: mstephen@us.ibm.com Cc: grimm@us.ibm.com Cc: mikey@ozlabs.au.ibm.com LKML-Reference: <20090710231313.GA23572@us.ibm.com> Signed-off-by: Ingo Molnar --- kernel/futex.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 794c862125fe..0672ff88f159 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -247,6 +247,7 @@ again: if (err < 0) return err; + page = compound_head(page); lock_page(page); if (!page->mapping) { unlock_page(page); -- cgit v1.2.3 From 405f55712dfe464b3240d7816cc4fe4174831be2 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 11 Jul 2009 22:08:37 +0400 Subject: headers: smp_lock.h redux * Remove smp_lock.h from files which don't need it (including some headers!) * Add smp_lock.h to files which do need it * Make smp_lock.h include conditional in hardirq.h It's needed only for one kernel_locked() usage which is under CONFIG_PREEMPT This will make hardirq.h inclusion cheaper for every PREEMPT=n config (which includes allmodconfig/allyesconfig, BTW) Signed-off-by: Alexey Dobriyan Signed-off-by: Linus Torvalds --- arch/alpha/kernel/ptrace.c | 1 - arch/blackfin/kernel/ptrace.c | 1 - arch/blackfin/kernel/sys_bfin.c | 1 - arch/cris/kernel/sys_cris.c | 1 - arch/ia64/kernel/ptrace.c | 1 - arch/m32r/kernel/ptrace.c | 1 - arch/microblaze/kernel/ptrace.c | 1 - arch/microblaze/kernel/signal.c | 1 - arch/microblaze/kernel/sys_microblaze.c | 1 - arch/mips/kernel/ptrace32.c | 1 - arch/mips/mm/hugetlbpage.c | 1 - arch/mn10300/kernel/ptrace.c | 1 - arch/mn10300/kernel/signal.c | 1 - arch/mn10300/kernel/sys_mn10300.c | 1 - arch/mn10300/kernel/traps.c | 1 - arch/mn10300/mm/fault.c | 1 - arch/mn10300/mm/misalignment.c | 1 - arch/powerpc/kernel/ptrace32.c | 1 - arch/s390/kernel/dis.c | 1 - arch/s390/kernel/ptrace.c | 1 - arch/s390/mm/fault.c | 1 - arch/sh/mm/tlb-sh3.c | 1 - arch/sparc/kernel/ptrace_32.c | 1 - arch/sparc/kernel/ptrace_64.c | 1 - arch/sparc/kernel/time_64.c | 1 - arch/sparc/kernel/traps_32.c | 1 - drivers/block/DAC960.c | 1 + drivers/block/cciss.c | 1 + drivers/block/loop.c | 1 - drivers/bluetooth/hci_vhci.c | 1 - drivers/char/amiserial.c | 1 + drivers/char/cyclades.c | 1 + drivers/char/epca.c | 1 + drivers/char/isicom.c | 1 + drivers/char/istallion.c | 1 + drivers/char/moxa.c | 1 + drivers/char/mxser.c | 1 + drivers/char/n_hdlc.c | 1 + drivers/char/n_r3964.c | 1 + drivers/char/pty.c | 1 + drivers/char/rio/rio_linux.c | 1 + drivers/char/riscom8.c | 1 + drivers/char/rocket.c | 1 + drivers/char/serial167.c | 1 + drivers/char/specialix.c | 1 + drivers/char/sx.c | 1 + drivers/char/synclink.c | 1 + 
drivers/char/synclink_gt.c | 1 + drivers/char/synclinkmp.c | 1 + drivers/char/tpm/tpm.c | 1 - drivers/char/tty_ioctl.c | 1 - drivers/char/tty_ldisc.c | 1 - drivers/char/vt.c | 1 + drivers/char/vt_ioctl.c | 1 + drivers/gpio/vr41xx_giu.c | 1 - drivers/hid/usbhid/hid-core.c | 1 - drivers/isdn/hisax/hfc_usb.c | 1 - drivers/isdn/i4l/isdn_tty.c | 1 + drivers/isdn/mISDN/stack.c | 1 + drivers/media/dvb/bt8xx/dst_ca.c | 1 + drivers/media/dvb/dvb-core/dvbdev.h | 1 - drivers/media/dvb/ttpci/av7110.c | 1 - drivers/media/radio/radio-mr800.c | 1 + drivers/media/radio/radio-si470x.c | 1 + drivers/media/video/bt8xx/bttv-driver.c | 1 + drivers/media/video/cx23885/cx23885-417.c | 1 + drivers/media/video/cx23885/cx23885-video.c | 1 + drivers/media/video/cx88/cx88-blackbird.c | 1 + drivers/media/video/cx88/cx88-video.c | 1 + drivers/media/video/dabusb.c | 1 + drivers/media/video/pwc/pwc-if.c | 1 + drivers/media/video/pwc/pwc.h | 1 - drivers/media/video/s2255drv.c | 1 + drivers/media/video/saa5246a.c | 1 - drivers/media/video/saa5249.c | 1 - drivers/media/video/saa7134/saa7134-empress.c | 1 + drivers/media/video/se401.c | 1 + drivers/media/video/stk-webcam.c | 1 + drivers/media/video/stradis.c | 1 + drivers/media/video/stv680.c | 1 + drivers/media/video/usbvideo/vicam.c | 1 + drivers/media/video/usbvision/usbvision-video.c | 1 + drivers/media/video/v4l2-dev.c | 1 - drivers/media/video/zoran/zoran_driver.c | 1 + drivers/misc/sgi-gru/grufile.c | 1 - drivers/misc/sgi-gru/grukservices.c | 1 - drivers/net/irda/irtty-sir.c | 1 - drivers/pci/hotplug/cpci_hotplug_core.c | 1 - drivers/pci/hotplug/cpqphp_ctrl.c | 1 - drivers/pci/hotplug/cpqphp_sysfs.c | 1 + drivers/pci/hotplug/pciehp_ctrl.c | 1 - drivers/pci/syscall.c | 1 - drivers/s390/block/dasd_ioctl.c | 1 + drivers/scsi/qla2xxx/qla_mid.c | 1 - drivers/telephony/ixj.c | 1 + drivers/telephony/phonedev.c | 1 - drivers/usb/class/cdc-wdm.c | 1 - drivers/usb/gadget/amd5536udc.c | 1 - drivers/usb/gadget/langwell_udc.c | 1 - drivers/usb/gadget/s3c2410_udc.c | 1 - drivers/usb/host/r8a66597-hcd.c | 1 - drivers/usb/misc/iowarrior.c | 1 + drivers/usb/misc/rio500.c | 1 + drivers/usb/misc/usblcd.c | 1 + drivers/usb/musb/cppi_dma.h | 1 - drivers/usb/musb/musb_core.h | 1 - drivers/usb/serial/ftdi_sio.c | 1 + drivers/usb/serial/mos7840.c | 1 + drivers/usb/serial/usb-serial.c | 1 + drivers/video/fbmem.c | 1 - fs/adfs/super.c | 1 + fs/afs/super.c | 1 + fs/autofs4/dev-ioctl.c | 1 - fs/bfs/dir.c | 1 - fs/bfs/file.c | 1 - fs/btrfs/compression.c | 1 - fs/btrfs/file.c | 1 - fs/btrfs/inode.c | 1 - fs/btrfs/ioctl.c | 1 - fs/btrfs/super.c | 1 - fs/char_dev.c | 1 - fs/compat.c | 1 - fs/compat_ioctl.c | 1 + fs/exofs/super.c | 1 + fs/ext2/ioctl.c | 1 - fs/ext4/ioctl.c | 1 - fs/fat/dir.c | 1 - fs/fat/namei_msdos.c | 1 - fs/fat/namei_vfat.c | 1 - fs/fcntl.c | 1 - fs/freevxfs/vxfs_super.c | 1 + fs/hfs/super.c | 1 + fs/hfsplus/super.c | 1 + fs/hpfs/dir.c | 1 + fs/hpfs/file.c | 1 + fs/hpfs/hpfs_fn.h | 1 - fs/hpfs/inode.c | 1 + fs/hpfs/namei.c | 1 + fs/jffs2/super.c | 1 + fs/lockd/clntproc.c | 1 + fs/lockd/svc4proc.c | 1 + fs/lockd/svcproc.c | 1 + fs/nfs/delegation.c | 1 + fs/nfs/dir.c | 1 - fs/nfs/file.c | 1 - fs/nfs/inode.c | 1 - fs/nfs/nfs4proc.c | 1 - fs/nfs/read.c | 1 - fs/nfsd/nfsctl.c | 1 - fs/nfsd/nfssvc.c | 1 - fs/nilfs2/dir.c | 1 - fs/ocfs2/ioctl.c | 1 - fs/reiserfs/xattr.c | 1 - fs/squashfs/super.c | 1 + fs/ubifs/ioctl.c | 1 - fs/xfs/linux-2.6/xfs_file.c | 1 - include/linux/crash_dump.h | 1 - include/linux/hardirq.h | 2 ++ include/linux/quotaops.h | 1 - include/linux/sunrpc/xdr.h | 1 - 
kernel/power/user.c | 1 - kernel/trace/blktrace.c | 1 + kernel/trace/trace.c | 1 + net/appletalk/ddp.c | 1 + net/ipx/af_ipx.c | 1 + net/irda/af_irda.c | 1 + net/irda/irnet/irnet.h | 1 - net/irda/irnet/irnet_ppp.c | 1 + net/sunrpc/clnt.c | 1 - net/sunrpc/sched.c | 1 - net/sunrpc/svc_xprt.c | 1 + net/wanrouter/wanmain.c | 1 + net/x25/af_x25.c | 1 + 173 files changed, 81 insertions(+), 93 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c index 1e9ad52c460e..e072041d19f8 100644 --- a/arch/alpha/kernel/ptrace.c +++ b/arch/alpha/kernel/ptrace.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/blackfin/kernel/ptrace.c b/arch/blackfin/kernel/ptrace.c index d76618db50df..6a387eec6b65 100644 --- a/arch/blackfin/kernel/ptrace.c +++ b/arch/blackfin/kernel/ptrace.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/blackfin/kernel/sys_bfin.c b/arch/blackfin/kernel/sys_bfin.c index a8f1329c15a4..3da60fb13ce4 100644 --- a/arch/blackfin/kernel/sys_bfin.c +++ b/arch/blackfin/kernel/sys_bfin.c @@ -29,7 +29,6 @@ * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ -#include #include #include #include diff --git a/arch/cris/kernel/sys_cris.c b/arch/cris/kernel/sys_cris.c index a79fbd87021b..2ad962c7e88e 100644 --- a/arch/cris/kernel/sys_cris.c +++ b/arch/cris/kernel/sys_cris.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 92c9689b7d97..9daa87fdb018 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/m32r/kernel/ptrace.c b/arch/m32r/kernel/ptrace.c index bf0abe9e1f73..98b8feb12ed8 100644 --- a/arch/m32r/kernel/ptrace.c +++ b/arch/m32r/kernel/ptrace.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/microblaze/kernel/ptrace.c b/arch/microblaze/kernel/ptrace.c index b86aa623e36d..53ff39af6a5c 100644 --- a/arch/microblaze/kernel/ptrace.c +++ b/arch/microblaze/kernel/ptrace.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c index 493819c25fba..1c80e4fc40ce 100644 --- a/arch/microblaze/kernel/signal.c +++ b/arch/microblaze/kernel/signal.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/microblaze/kernel/sys_microblaze.c b/arch/microblaze/kernel/sys_microblaze.c index 8c9ebac5da10..e000bce09b2b 100644 --- a/arch/microblaze/kernel/sys_microblaze.c +++ b/arch/microblaze/kernel/sys_microblaze.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/mips/kernel/ptrace32.c b/arch/mips/kernel/ptrace32.c index c4f9ac17474a..32644b4a0714 100644 --- a/arch/mips/kernel/ptrace32.c +++ b/arch/mips/kernel/ptrace32.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c index 471c09aa1614..8c2834f5919d 100644 --- a/arch/mips/mm/hugetlbpage.c +++ b/arch/mips/mm/hugetlbpage.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/mn10300/kernel/ptrace.c b/arch/mn10300/kernel/ptrace.c index e143339ad28e..cf847dabc1bd 100644 --- a/arch/mn10300/kernel/ptrace.c +++ 
b/arch/mn10300/kernel/ptrace.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c index 9f7572a0f578..feb2f2e810db 100644 --- a/arch/mn10300/kernel/signal.c +++ b/arch/mn10300/kernel/signal.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/mn10300/kernel/sys_mn10300.c b/arch/mn10300/kernel/sys_mn10300.c index bca5a84dc72c..29d196b83d25 100644 --- a/arch/mn10300/kernel/sys_mn10300.c +++ b/arch/mn10300/kernel/sys_mn10300.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/mn10300/kernel/traps.c b/arch/mn10300/kernel/traps.c index 0dfdc5001124..91365adba4f5 100644 --- a/arch/mn10300/kernel/traps.c +++ b/arch/mn10300/kernel/traps.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c index a62e1e138bc1..53bb17d0f068 100644 --- a/arch/mn10300/mm/fault.c +++ b/arch/mn10300/mm/fault.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include /* For unblank_screen() */ diff --git a/arch/mn10300/mm/misalignment.c b/arch/mn10300/mm/misalignment.c index 94c4a4358065..30016251f658 100644 --- a/arch/mn10300/mm/misalignment.c +++ b/arch/mn10300/mm/misalignment.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/ptrace32.c b/arch/powerpc/kernel/ptrace32.c index 297632cba047..8a6daf4129f6 100644 --- a/arch/powerpc/kernel/ptrace32.c +++ b/arch/powerpc/kernel/ptrace32.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index d2f270c995d9..db943a7ec513 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 490b39934d65..43acd73105b7 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 74eb26bf1970..e5e119fe03b2 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/sh/mm/tlb-sh3.c b/arch/sh/mm/tlb-sh3.c index 7fbfd5a11ffa..17cb7c3adf22 100644 --- a/arch/sh/mm/tlb-sh3.c +++ b/arch/sh/mm/tlb-sh3.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include diff --git a/arch/sparc/kernel/ptrace_32.c b/arch/sparc/kernel/ptrace_32.c index 8ce6285a06d5..7e3dfd9bb97e 100644 --- a/arch/sparc/kernel/ptrace_32.c +++ b/arch/sparc/kernel/ptrace_32.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c index a941c610e7ce..4ae91dc2feb9 100644 --- a/arch/sparc/kernel/ptrace_64.c +++ b/arch/sparc/kernel/ptrace_64.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/sparc/kernel/time_64.c b/arch/sparc/kernel/time_64.c index 5c12e79b4bdf..da1218e8ee87 100644 --- a/arch/sparc/kernel/time_64.c +++ b/arch/sparc/kernel/time_64.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/sparc/kernel/traps_32.c 
b/arch/sparc/kernel/traps_32.c index 358283341b47..c0490c7bbde0 100644 --- a/arch/sparc/kernel/traps_32.c +++ b/arch/sparc/kernel/traps_32.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 668dc234b8e2..1e6b7c14f697 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 65a0655e7fc8..a52cc7fe45ea 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 801f4ab83302..5757188cd1fb 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -61,7 +61,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/bluetooth/hci_vhci.c b/drivers/bluetooth/hci_vhci.c index 1df9dda2e377..d5cde6d86f89 100644 --- a/drivers/bluetooth/hci_vhci.c +++ b/drivers/bluetooth/hci_vhci.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/char/amiserial.c b/drivers/char/amiserial.c index 72429b6b2fa8..6c32fbf07164 100644 --- a/drivers/char/amiserial.c +++ b/drivers/char/amiserial.c @@ -81,6 +81,7 @@ static char *serial_version = "4.30"; #include #include #include +#include #include #include diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c index f3366d3f06cf..2dafc2da0648 100644 --- a/drivers/char/cyclades.c +++ b/drivers/char/cyclades.c @@ -633,6 +633,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/epca.c b/drivers/char/epca.c index abef1f7d84fe..ff647ca1c489 100644 --- a/drivers/char/epca.c +++ b/drivers/char/epca.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/isicom.c b/drivers/char/isicom.c index 621d1184673c..4f1f4cd670da 100644 --- a/drivers/char/isicom.c +++ b/drivers/char/isicom.c @@ -122,6 +122,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/istallion.c b/drivers/char/istallion.c index 0c999f5bb3db..ab2f3349c5c4 100644 --- a/drivers/char/istallion.c +++ b/drivers/char/istallion.c @@ -20,6 +20,7 @@ #include #include +#include #include #include #include diff --git a/drivers/char/moxa.c b/drivers/char/moxa.c index 65b6ff2442c6..dd0083bbb64a 100644 --- a/drivers/char/moxa.c +++ b/drivers/char/moxa.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/mxser.c b/drivers/char/mxser.c index 52d953eb30c3..dbf8d52f31d0 100644 --- a/drivers/char/mxser.c +++ b/drivers/char/mxser.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/n_hdlc.c b/drivers/char/n_hdlc.c index 1c43c8cdee25..c68118efad84 100644 --- a/drivers/char/n_hdlc.c +++ b/drivers/char/n_hdlc.c @@ -97,6 +97,7 @@ #include #include #include +#include #include /* used in new tty drivers */ #include /* used in new tty drivers */ #include diff --git a/drivers/char/n_r3964.c b/drivers/char/n_r3964.c index 2e99158ebb8a..6934025a1ac1 100644 --- a/drivers/char/n_r3964.c +++ b/drivers/char/n_r3964.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include #include /* used in new tty drivers */ diff --git a/drivers/char/pty.c b/drivers/char/pty.c index 9d1b4f548f67..6e6942c45f5b 100644 --- 
a/drivers/char/pty.c +++ b/drivers/char/pty.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/rio/rio_linux.c b/drivers/char/rio/rio_linux.c index ce81da5b2da9..d58c2eb07f07 100644 --- a/drivers/char/rio/rio_linux.c +++ b/drivers/char/rio/rio_linux.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include diff --git a/drivers/char/riscom8.c b/drivers/char/riscom8.c index 217660451237..171711acf5cd 100644 --- a/drivers/char/riscom8.c +++ b/drivers/char/riscom8.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include diff --git a/drivers/char/rocket.c b/drivers/char/rocket.c index 63d5b628477a..0e29a23ec4c5 100644 --- a/drivers/char/rocket.c +++ b/drivers/char/rocket.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/serial167.c b/drivers/char/serial167.c index f1f24f0ee26f..51e7a46787be 100644 --- a/drivers/char/serial167.c +++ b/drivers/char/serial167.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/specialix.c b/drivers/char/specialix.c index e72be4190a44..bfe4cdb2febb 100644 --- a/drivers/char/specialix.c +++ b/drivers/char/specialix.c @@ -87,6 +87,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/sx.c b/drivers/char/sx.c index 518f2a25d91e..a81ec4fcf6ff 100644 --- a/drivers/char/sx.c +++ b/drivers/char/sx.c @@ -216,6 +216,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/synclink.c b/drivers/char/synclink.c index afded3a2379c..813552f14884 100644 --- a/drivers/char/synclink.c +++ b/drivers/char/synclink.c @@ -81,6 +81,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/synclink_gt.c b/drivers/char/synclink_gt.c index a2e67e6df3a1..91f20a92fddf 100644 --- a/drivers/char/synclink_gt.c +++ b/drivers/char/synclink_gt.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/synclinkmp.c b/drivers/char/synclinkmp.c index 6f727e3c53ad..8d4a2a8a0a70 100644 --- a/drivers/char/synclinkmp.c +++ b/drivers/char/synclinkmp.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/tpm/tpm.c b/drivers/char/tpm/tpm.c index ccdd828adcef..b0603b2e5684 100644 --- a/drivers/char/tpm/tpm.c +++ b/drivers/char/tpm/tpm.c @@ -26,7 +26,6 @@ #include #include #include -#include #include "tpm.h" diff --git a/drivers/char/tty_ioctl.c b/drivers/char/tty_ioctl.c index b24f6c6a1ea3..ad6ba4ed2808 100644 --- a/drivers/char/tty_ioctl.c +++ b/drivers/char/tty_ioctl.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include diff --git a/drivers/char/tty_ldisc.c b/drivers/char/tty_ldisc.c index 913aa8d3f1c5..0ef0dc97ba20 100644 --- a/drivers/char/tty_ldisc.c +++ b/drivers/char/tty_ldisc.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/char/vt.c b/drivers/char/vt.c index d9113b4c76e3..7947bd1b4cf7 100644 --- a/drivers/char/vt.c +++ b/drivers/char/vt.c @@ -89,6 +89,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/vt_ioctl.c b/drivers/char/vt_ioctl.c index 7539bed0f7e0..95189f288f8c 100644 --- a/drivers/char/vt_ioctl.c +++ b/drivers/char/vt_ioctl.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include diff --git a/drivers/gpio/vr41xx_giu.c 
b/drivers/gpio/vr41xx_giu.c index b70e06133e78..b16c9a8c03f5 100644 --- a/drivers/gpio/vr41xx_giu.c +++ b/drivers/gpio/vr41xx_giu.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index 76c4bbe9dccb..3c1fcb7640ab 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/isdn/hisax/hfc_usb.c b/drivers/isdn/hisax/hfc_usb.c index 8df889b0c1a9..9de54202c90c 100644 --- a/drivers/isdn/hisax/hfc_usb.c +++ b/drivers/isdn/hisax/hfc_usb.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include "hisax.h" diff --git a/drivers/isdn/i4l/isdn_tty.c b/drivers/isdn/i4l/isdn_tty.c index b4d4522e5071..2881a66c1aa9 100644 --- a/drivers/isdn/i4l/isdn_tty.c +++ b/drivers/isdn/i4l/isdn_tty.c @@ -13,6 +13,7 @@ #include #include +#include #include "isdn_common.h" #include "isdn_tty.h" #ifdef CONFIG_ISDN_AUDIO diff --git a/drivers/isdn/mISDN/stack.c b/drivers/isdn/mISDN/stack.c index e2f45019ebf0..3e1532a180ff 100644 --- a/drivers/isdn/mISDN/stack.c +++ b/drivers/isdn/mISDN/stack.c @@ -17,6 +17,7 @@ #include #include +#include #include "core.h" static u_int *debug; diff --git a/drivers/media/dvb/bt8xx/dst_ca.c b/drivers/media/dvb/bt8xx/dst_ca.c index 4601b059b2b2..0e246eaad05a 100644 --- a/drivers/media/dvb/bt8xx/dst_ca.c +++ b/drivers/media/dvb/bt8xx/dst_ca.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include "dvbdev.h" diff --git a/drivers/media/dvb/dvb-core/dvbdev.h b/drivers/media/dvb/dvb-core/dvbdev.h index 79927305e84d..487919bea7ae 100644 --- a/drivers/media/dvb/dvb-core/dvbdev.h +++ b/drivers/media/dvb/dvb-core/dvbdev.h @@ -27,7 +27,6 @@ #include #include #include -#include #define DVB_MAJOR 212 diff --git a/drivers/media/dvb/ttpci/av7110.c b/drivers/media/dvb/ttpci/av7110.c index d1d959ed37b7..8d65c652ba50 100644 --- a/drivers/media/dvb/ttpci/av7110.c +++ b/drivers/media/dvb/ttpci/av7110.c @@ -36,7 +36,6 @@ #include #include #include -#include #include #include diff --git a/drivers/media/radio/radio-mr800.c b/drivers/media/radio/radio-mr800.c index 837467f93805..575bf9d89419 100644 --- a/drivers/media/radio/radio-mr800.c +++ b/drivers/media/radio/radio-mr800.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/radio/radio-si470x.c b/drivers/media/radio/radio-si470x.c index 46d216329611..e85f318b4d2b 100644 --- a/drivers/media/radio/radio-si470x.c +++ b/drivers/media/radio/radio-si470x.c @@ -127,6 +127,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/video/bt8xx/bttv-driver.c b/drivers/media/video/bt8xx/bttv-driver.c index 5eb1464af670..d147d29bb0d3 100644 --- a/drivers/media/video/bt8xx/bttv-driver.c +++ b/drivers/media/video/bt8xx/bttv-driver.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include "bttvp.h" diff --git a/drivers/media/video/cx23885/cx23885-417.c b/drivers/media/video/cx23885/cx23885-417.c index 2943bfd32a94..428f0c45e6b7 100644 --- a/drivers/media/video/cx23885/cx23885-417.c +++ b/drivers/media/video/cx23885/cx23885-417.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/video/cx23885/cx23885-video.c b/drivers/media/video/cx23885/cx23885-video.c index 70836af3ab48..5d6093336300 100644 --- 
a/drivers/media/video/cx23885/cx23885-video.c +++ b/drivers/media/video/cx23885/cx23885-video.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/video/cx88/cx88-blackbird.c b/drivers/media/video/cx88/cx88-blackbird.c index 44eacfb0d0d6..356d6896da3f 100644 --- a/drivers/media/video/cx88/cx88-blackbird.c +++ b/drivers/media/video/cx88/cx88-blackbird.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/video/cx88/cx88-video.c b/drivers/media/video/cx88/cx88-video.c index b12770848c00..2bb54c3ef5cd 100644 --- a/drivers/media/video/cx88/cx88-video.c +++ b/drivers/media/video/cx88/cx88-video.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/video/dabusb.c b/drivers/media/video/dabusb.c index ec2f45dde164..0664d111085f 100644 --- a/drivers/media/video/dabusb.c +++ b/drivers/media/video/dabusb.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/video/pwc/pwc-if.c b/drivers/media/video/pwc/pwc-if.c index db25c3034c11..8d17cf613306 100644 --- a/drivers/media/video/pwc/pwc-if.c +++ b/drivers/media/video/pwc/pwc-if.c @@ -62,6 +62,7 @@ #include #include #include +#include #ifdef CONFIG_USB_PWC_INPUT_EVDEV #include #endif diff --git a/drivers/media/video/pwc/pwc.h b/drivers/media/video/pwc/pwc.h index 0be6f814f539..0b658dee05a4 100644 --- a/drivers/media/video/pwc/pwc.h +++ b/drivers/media/video/pwc/pwc.h @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/media/video/s2255drv.c b/drivers/media/video/s2255drv.c index 6be845ccc7d7..9e3262c0ba37 100644 --- a/drivers/media/video/s2255drv.c +++ b/drivers/media/video/s2255drv.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/video/saa5246a.c b/drivers/media/video/saa5246a.c index 155804b061e9..b624a4c01fdc 100644 --- a/drivers/media/video/saa5246a.c +++ b/drivers/media/video/saa5246a.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/media/video/saa5249.c b/drivers/media/video/saa5249.c index 271d6e931b75..12835fb82c95 100644 --- a/drivers/media/video/saa5249.c +++ b/drivers/media/video/saa5249.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/media/video/saa7134/saa7134-empress.c b/drivers/media/video/saa7134/saa7134-empress.c index add1757f8930..296788c3bf0e 100644 --- a/drivers/media/video/saa7134/saa7134-empress.c +++ b/drivers/media/video/saa7134/saa7134-empress.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "saa7134-reg.h" diff --git a/drivers/media/video/se401.c b/drivers/media/video/se401.c index c8f05297d0f0..85ffc2cba039 100644 --- a/drivers/media/video/se401.c +++ b/drivers/media/video/se401.c @@ -31,6 +31,7 @@ static const char version[] = "0.24"; #include #include #include +#include #include #include #include "se401.h" diff --git a/drivers/media/video/stk-webcam.c b/drivers/media/video/stk-webcam.c index 2e5937047278..4d6785e63455 100644 --- a/drivers/media/video/stk-webcam.c +++ b/drivers/media/video/stk-webcam.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include diff --git a/drivers/media/video/stradis.c b/drivers/media/video/stradis.c index 0eb313082c97..eaada39c76fd 100644 --- a/drivers/media/video/stradis.c +++ 
b/drivers/media/video/stradis.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/video/stv680.c b/drivers/media/video/stv680.c index 75f286f7a2e9..8b4e7dafce7b 100644 --- a/drivers/media/video/stv680.c +++ b/drivers/media/video/stv680.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/video/usbvideo/vicam.c b/drivers/media/video/usbvideo/vicam.c index 8d73979596f9..45fce39ec9ad 100644 --- a/drivers/media/video/usbvideo/vicam.c +++ b/drivers/media/video/usbvideo/vicam.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/video/usbvision/usbvision-video.c b/drivers/media/video/usbvision/usbvision-video.c index 90b58914f984..90d9b5c0e9a7 100644 --- a/drivers/media/video/usbvision/usbvision-video.c +++ b/drivers/media/video/usbvision/usbvision-video.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/video/v4l2-dev.c b/drivers/media/video/v4l2-dev.c index 31eac66411d7..a7f1b69a7dab 100644 --- a/drivers/media/video/v4l2-dev.c +++ b/drivers/media/video/v4l2-dev.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include diff --git a/drivers/media/video/zoran/zoran_driver.c b/drivers/media/video/zoran/zoran_driver.c index 3d7df32a3d87..bcdefb1bcb3d 100644 --- a/drivers/media/video/zoran/zoran_driver.c +++ b/drivers/media/video/zoran/zoran_driver.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c index fa2d93a9fb8d..aed609832bc2 100644 --- a/drivers/misc/sgi-gru/grufile.c +++ b/drivers/misc/sgi-gru/grufile.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/misc/sgi-gru/grukservices.c b/drivers/misc/sgi-gru/grukservices.c index eedbf9c32760..79689b10f937 100644 --- a/drivers/misc/sgi-gru/grukservices.c +++ b/drivers/misc/sgi-gru/grukservices.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/irda/irtty-sir.c b/drivers/net/irda/irtty-sir.c index d53aa9582137..20f9bc626688 100644 --- a/drivers/net/irda/irtty-sir.c +++ b/drivers/net/irda/irtty-sir.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include diff --git a/drivers/pci/hotplug/cpci_hotplug_core.c b/drivers/pci/hotplug/cpci_hotplug_core.c index a5b9f6ae507b..d703e73fffa7 100644 --- a/drivers/pci/hotplug/cpci_hotplug_core.c +++ b/drivers/pci/hotplug/cpci_hotplug_core.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/pci/hotplug/cpqphp_ctrl.c b/drivers/pci/hotplug/cpqphp_ctrl.c index 2fa47af992a8..0ff689afa757 100644 --- a/drivers/pci/hotplug/cpqphp_ctrl.c +++ b/drivers/pci/hotplug/cpqphp_ctrl.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/pci/hotplug/cpqphp_sysfs.c b/drivers/pci/hotplug/cpqphp_sysfs.c index 8450f4a6568a..e6089bdb6e5b 100644 --- a/drivers/pci/hotplug/cpqphp_sysfs.c +++ b/drivers/pci/hotplug/cpqphp_sysfs.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include "cpqphp.h" diff --git a/drivers/pci/hotplug/pciehp_ctrl.c b/drivers/pci/hotplug/pciehp_ctrl.c index ff4034502d24..8aab8edf123e 100644 --- a/drivers/pci/hotplug/pciehp_ctrl.c +++ b/drivers/pci/hotplug/pciehp_ctrl.c @@ -30,7 +30,6 @@ #include #include #include 
-#include #include #include #include "../pci.h" diff --git a/drivers/pci/syscall.c b/drivers/pci/syscall.c index ec22284eed30..e1c1ec540893 100644 --- a/drivers/pci/syscall.c +++ b/drivers/pci/syscall.c @@ -9,7 +9,6 @@ #include #include -#include #include #include #include "pci.h" diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c index 4ce3f72ee1c1..df918ef27965 100644 --- a/drivers/s390/block/dasd_ioctl.c +++ b/drivers/s390/block/dasd_ioctl.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include diff --git a/drivers/scsi/qla2xxx/qla_mid.c b/drivers/scsi/qla2xxx/qla_mid.c index 650bcef08f2a..cd78c501803a 100644 --- a/drivers/scsi/qla2xxx/qla_mid.c +++ b/drivers/scsi/qla2xxx/qla_mid.c @@ -9,7 +9,6 @@ #include #include -#include #include #include diff --git a/drivers/telephony/ixj.c b/drivers/telephony/ixj.c index a913efc69669..40de151f2789 100644 --- a/drivers/telephony/ixj.c +++ b/drivers/telephony/ixj.c @@ -257,6 +257,7 @@ #include /* everything... */ #include /* error codes */ #include +#include #include #include #include diff --git a/drivers/telephony/phonedev.c b/drivers/telephony/phonedev.c index b52cc830c0b4..f3873f650bb4 100644 --- a/drivers/telephony/phonedev.c +++ b/drivers/telephony/phonedev.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c index 0fe434505ac4..ba589d4ca8bc 100644 --- a/drivers/usb/class/cdc-wdm.c +++ b/drivers/usb/class/cdc-wdm.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/usb/gadget/amd5536udc.c b/drivers/usb/gadget/amd5536udc.c index 826f3adde5d8..77352ccc245e 100644 --- a/drivers/usb/gadget/amd5536udc.c +++ b/drivers/usb/gadget/amd5536udc.c @@ -48,7 +48,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/usb/gadget/langwell_udc.c b/drivers/usb/gadget/langwell_udc.c index 6829d5961359..a3913519fd58 100644 --- a/drivers/usb/gadget/langwell_udc.c +++ b/drivers/usb/gadget/langwell_udc.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/usb/gadget/s3c2410_udc.c b/drivers/usb/gadget/s3c2410_udc.c index 9a2b8920532d..a9b452fe6221 100644 --- a/drivers/usb/gadget/s3c2410_udc.c +++ b/drivers/usb/gadget/s3c2410_udc.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/usb/host/r8a66597-hcd.c b/drivers/usb/host/r8a66597-hcd.c index 56976cc0352a..e18f74946e68 100644 --- a/drivers/usb/host/r8a66597-hcd.c +++ b/drivers/usb/host/r8a66597-hcd.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/usb/misc/iowarrior.c b/drivers/usb/misc/iowarrior.c index 3c5fe5cee05a..90e1a8dedfa9 100644 --- a/drivers/usb/misc/iowarrior.c +++ b/drivers/usb/misc/iowarrior.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/drivers/usb/misc/rio500.c b/drivers/usb/misc/rio500.c index deb95bb49fd1..d645f3899fe1 100644 --- a/drivers/usb/misc/rio500.c +++ b/drivers/usb/misc/rio500.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/usb/misc/usblcd.c b/drivers/usb/misc/usblcd.c index e0ff9ccd866b..29092b8e59ce 100644 --- a/drivers/usb/misc/usblcd.c +++ b/drivers/usb/misc/usblcd.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/usb/musb/cppi_dma.h 
b/drivers/usb/musb/cppi_dma.h index 8a39de3e6e47..59bf949e589b 100644 --- a/drivers/usb/musb/cppi_dma.h +++ b/drivers/usb/musb/cppi_dma.h @@ -5,7 +5,6 @@ #include #include -#include #include #include diff --git a/drivers/usb/musb/musb_core.h b/drivers/usb/musb/musb_core.h index f3772ca3b2cf..381d648a36b8 100644 --- a/drivers/usb/musb/musb_core.h +++ b/drivers/usb/musb/musb_core.h @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c index 5f08702f672f..5a8ae274d258 100644 --- a/drivers/usb/serial/ftdi_sio.c +++ b/drivers/usb/serial/ftdi_sio.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c index c40f95c1951c..c31940a307f8 100644 --- a/drivers/usb/serial/mos7840.c +++ b/drivers/usb/serial/mos7840.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c index a84216464ca0..0c39b55aeef4 100644 --- a/drivers/usb/serial/usb-serial.c +++ b/drivers/usb/serial/usb-serial.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c index 53ea05645ff8..a85c818be945 100644 --- a/drivers/video/fbmem.c +++ b/drivers/video/fbmem.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/adfs/super.c b/fs/adfs/super.c index aad92f0a1048..6910a98bd73c 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "adfs.h" #include "dir_f.h" diff --git a/fs/afs/super.c b/fs/afs/super.c index ad0514d0115f..e1ea1c240b6a 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index f3da2eb51f56..00bf8fcb245f 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 54bd07d44e68..1e41aadb1068 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include "bfs.h" diff --git a/fs/bfs/file.c b/fs/bfs/file.c index 6a021265f018..88b9a3ff44e4 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -11,7 +11,6 @@ #include #include -#include #include "bfs.h" #undef DEBUG diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index de1e2fd32080..9d8ba4d54a37 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7c3cd248d8d6..4b833972273a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7ffa3d34ea19..791eab19e330 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9f4db848db10..bd88f25889f7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 
9f179d4832d5..6d6d06cb6dfc 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/char_dev.c b/fs/char_dev.c index b7c9d5187a75..a173551e19d7 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include diff --git a/fs/compat.c b/fs/compat.c index fbadb947727b..94502dab972a 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 626c7483b4de..f28f070a60fc 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/exofs/super.c b/fs/exofs/super.c index a343b4ea62f6..5ab10c3bbebe 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -31,6 +31,7 @@ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ +#include #include #include #include diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 7cb4badef927..e7431309bdca 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index bb415408fdb6..24a6abb2aef5 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 38ff75a0fe22..530b4ca01510 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 82f88733b681..bbc94ae4fd77 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -9,7 +9,6 @@ #include #include #include -#include #include "fat.h" /* Characters that are undesirable in an MS-DOS file name */ diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 73471b7ecc8c..cb6e83557112 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include "fat.h" diff --git a/fs/fcntl.c b/fs/fcntl.c index a040b764f8e3..ae413086db97 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index cdbd1654e4cd..1e8af939b3e4 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 6f833dc8e910..f7fcbe49da72 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "hfs_fs.h" diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 9fc3af0c0dab..c0759fe0855b 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 6916c41d7017..8865c94f55f6 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -6,6 +6,7 @@ * directory VFS functions */ +#include #include "hpfs_fn.h" static int hpfs_dir_release(struct inode *inode, struct file *filp) diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 64ab52259204..3efabff00367 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -6,6 +6,7 @@ * file VFS functions */ +#include #include "hpfs_fn.h" #define BLOCKS(size) (((size) + 511) >> 9) diff 
--git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index c2ea31bae313..701ca54c0867 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -13,7 +13,6 @@ #include #include #include -#include #include "hpfs.h" diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 39a1bfbea312..fe703ae46bc7 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -6,6 +6,7 @@ * inode VFS functions */ +#include #include "hpfs_fn.h" void hpfs_init_inode(struct inode *i) diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index b649232dde97..82b9c4ba9ed0 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -6,6 +6,7 @@ * adding & removing files & directories */ #include +#include #include "hpfs_fn.h" static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 07a22caf2687..0035c021395a 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index f2fdcbce143e..4336adba952a 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 1725037374c5..bd173a6ca3b1 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 3688e55901fc..e1d28ddd2169 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index af05b918cb5b..6dd48a4405b4 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 89f98e9a024b..38d42c29fb92 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 0055b813ec2c..05062329b678 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 64f87194d390..bd7938eda6a8 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 92ce43517814..ff0c080db59b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -45,7 +45,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 96c4ebfa46f4..73ea5e8d66ce 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -18,7 +18,6 @@ #include #include #include -#include #include diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 1250fb978ac1..6d0847562d87 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index d4c9884cd54b..492c79b7800b 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 54100acc1102..1a4fa04cf071 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -43,7 +43,6 @@ */ #include -#include #include "nilfs.h" #include "page.h" diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c 
index 9fcd36dcc9a0..467b413bec21 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -7,7 +7,6 @@ #include #include -#include #define MLOG_MASK_PREFIX ML_INODE #include diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index f3d47d856848..6925b835a43b 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 3b52770f46ff..cb5fc57e370b 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 6db7a6be6c97..8aacd64957a2 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -25,7 +25,6 @@ /* This file implements EXT2-compatible extended attribute ioctl() calls */ #include -#include #include #include "ubifs.h" diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index f4e255441574..0542fd507649 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -41,7 +41,6 @@ #include "xfs_ioctl.h" #include -#include static struct vm_operations_struct xfs_file_vm_ops; diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 2dac064d8359..0026f267da20 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -3,7 +3,6 @@ #ifdef CONFIG_CRASH_DUMP #include -#include #include #include diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 45257475623c..8246c697863d 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -2,7 +2,9 @@ #define LINUX_HARDIRQ_H #include +#ifdef CONFIG_PREEMPT #include +#endif #include #include #include diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 7bc457593684..26361c4c037a 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -7,7 +7,6 @@ #ifndef _LINUX_QUOTAOPS_ #define _LINUX_QUOTAOPS_ -#include #include static inline struct quota_info *sb_dqopt(struct super_block *sb) diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index d8910b68e1bd..b99c625fddfe 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -12,7 +12,6 @@ #include #include #include -#include /* * Buffer adjustment diff --git a/kernel/power/user.c b/kernel/power/user.c index ed97375daae9..bf0014d6a5f0 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 39af8af6fc30..1090b0aed9ba 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3aa0a0dfdfa8..8bc8d8afea6a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 590b83963622..bfbe13786bb4 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -54,6 +54,7 @@ #include #include #include +#include #include /* For TIOCOUTQ/INQ */ #include #include diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 417b0e309495..f1118d92a191 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index cb762c8723ea..80cf29aae096 100644 --- 
a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include diff --git a/net/irda/irnet/irnet.h b/net/irda/irnet/irnet.h index bccf4d0059f0..b001c361ad30 100644 --- a/net/irda/irnet/irnet.h +++ b/net/irda/irnet/irnet.h @@ -241,7 +241,6 @@ #include #include -#include #include #include #include diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c index 6d8ae03c14f5..68cbcb19cbd8 100644 --- a/net/irda/irnet/irnet_ppp.c +++ b/net/irda/irnet/irnet_ppp.c @@ -13,6 +13,7 @@ * 2) as a control channel (write commands, read events) */ +#include #include "irnet_ppp.h" /* Private header */ /* Please put other headers in irnet.h - Thanks */ diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 5bc2f45bddf0..ebfcf9b89909 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 1102ce1251f7..8f459abe97cf 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 6f33d33cc064..27d44332f017 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -5,6 +5,7 @@ */ #include +#include #include #include #include diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c index 466e2d22d256..258daa80ad92 100644 --- a/net/wanrouter/wanmain.c +++ b/net/wanrouter/wanmain.c @@ -48,6 +48,7 @@ #include #include /* support for loadable modules */ #include /* kmalloc(), kfree() */ +#include #include #include /* inline mem*, str* functions */ diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 21cdc872004e..5e6c072c64d3 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From d4d7d0b9545721d3cabb19d15163bbc66b797707 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 6 Jul 2009 09:31:33 +0100 Subject: perf_counter: Fix the tracepoint channel to perfcounters Fix a missed rename in EVENT_PROFILE support so that it gets built and allows tracepoint tracing from the 'perf' tool. Fix a typo in the (never before built & enabled) portion in perf_counter.c as well, and update that code to the attr.config changes as well. 
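Illustrative sketch (not part of the patch): the typo being fixed is a stray semicolon inside a C99 designated initializer, where only commas are valid between members. A minimal standalone example of the same mistake, using hypothetical names:

#include <stdio.h>

struct sample_data {
	void *regs;
	unsigned long addr;
};

/* Hypothetical stand-in for get_irq_regs(). */
static void *current_regs(void) { return NULL; }

int main(void)
{
	struct sample_data data = {
		.regs = current_regs(),	/* a ';' here instead of ',' does not compile */
		.addr = 0,
	};

	printf("addr=%lu\n", data.addr);
	return 0;
}
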
Signed-off-by: Chris Wilson Cc: Ben Gamari Cc: Jason Baron Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt LKML-Reference: <1246869094-21237-1-git-send-email-chris@chris-wilson.co.uk> Signed-off-by: Ingo Molnar --- init/Kconfig | 2 +- kernel/perf_counter.c | 10 +++------- 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index 1ce05a4cb5f6..cb2c09270226 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -962,7 +962,7 @@ config PERF_COUNTERS config EVENT_PROFILE bool "Tracepoint profile sources" - depends on PERF_COUNTERS && EVENT_TRACER + depends on PERF_COUNTERS && EVENT_TRACING default y endmenu diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d55a50da2347..c6c38fb7766a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3671,7 +3671,7 @@ static const struct pmu perf_ops_task_clock = { void perf_tpcounter_event(int event_id) { struct perf_sample_data data = { - .regs = get_irq_regs(); + .regs = get_irq_regs(), .addr = 0, }; @@ -3687,16 +3687,12 @@ extern void ftrace_profile_disable(int); static void tp_perf_counter_destroy(struct perf_counter *counter) { - ftrace_profile_disable(perf_event_id(&counter->attr)); + ftrace_profile_disable(counter->attr.config); } static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { - int event_id = perf_event_id(&counter->attr); - int ret; - - ret = ftrace_profile_enable(event_id); - if (ret) + if (ftrace_profile_enable(counter->attr.config)) return NULL; counter->destroy = tp_perf_counter_destroy; -- cgit v1.2.3 From 6ab5d668b131d3c5416f6df1d3ca95b82d4fe8a2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 4 Jun 2009 00:55:45 -0400 Subject: tracing/function-profiler: do not free per cpu variable stat The per cpu variable stat is freeded if we fail to allocate a name on start up. This was due to stat at first being allocated in the initial design. But since then, it has become a static per cpu variable but the free on error was not removed. Also added __init annotation to the function that this is in. [ Impact: prevent possible memory corruption on low mem at boot up ] Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bce9e01a29c8..4521c77d1a1a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -768,7 +768,7 @@ static struct tracer_stat function_stats __initdata = { .stat_show = function_stat_show }; -static void ftrace_profile_debugfs(struct dentry *d_tracer) +static __init void ftrace_profile_debugfs(struct dentry *d_tracer) { struct ftrace_profile_stat *stat; struct dentry *entry; @@ -786,7 +786,6 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer) * The files created are permanent, if something happens * we still do not free memory. 
*/ - kfree(stat); WARN(1, "Could not allocate stat file for cpu %d\n", cpu); @@ -813,7 +812,7 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer) } #else /* CONFIG_FUNCTION_PROFILER */ -static void ftrace_profile_debugfs(struct dentry *d_tracer) +static __init void ftrace_profile_debugfs(struct dentry *d_tracer) { } #endif /* CONFIG_FUNCTION_PROFILER */ -- cgit v1.2.3 From 04aef32d39cc4ef80087c0ce8ed113c6d64f1a6b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 15 Jul 2009 12:29:06 +0800 Subject: tracing/function: Fix the return value of ftrace_trace_onoff_callback() ftrace_trace_onoff_callback() will return an error even if we do the right operation, for example: # echo _spin_*:traceon:10 > set_ftrace_filter -bash: echo: write error: Invalid argument # cat set_ftrace_filter #### all functions enabled #### _spin_trylock_bh:traceon:count=10 _spin_unlock_irq:traceon:count=10 _spin_unlock_bh:traceon:count=10 _spin_lock_irq:traceon:count=10 _spin_unlock:traceon:count=10 _spin_trylock:traceon:count=10 _spin_unlock_irqrestore:traceon:count=10 _spin_lock_irqsave:traceon:count=10 _spin_lock_bh:traceon:count=10 _spin_lock:traceon:count=10 We want to set _spin_*:traceon:10 to set_ftrace_filter, it complains with "Invalid argument", but the operation is successful. This is because ftrace_process_regex() returns the number of functions that matched the pattern. If the number is not 0, this value is returned by ftrace_regex_write() whereas we want to return the number of bytes virtually written. Also the file offset pointer is not updated in this case. If the number of matched functions is lower than the number of bytes written by the user, this results to a reprocessing of the string given by the user with a lower size, leading to a malformed ftrace regex and then a -EINVAL returned. So, this patch fixes it by returning 0 if no error occured. The fix also applies on 2.6.30 Signed-off-by: Xiao Guangrong Reviewed-by: Li Zefan Cc: stable@kernel.org Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_functions.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 7402144bff21..75ef000613c3 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -363,7 +363,7 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) out_reg: ret = register_ftrace_function_probe(glob, ops, count); - return ret; + return ret < 0 ? ret : 0; } static struct ftrace_func_command ftrace_traceon_cmd = { -- cgit v1.2.3 From 54fdc5816631b43ba55fc3206d7add2d85850bc6 Mon Sep 17 00:00:00 2001 From: Fabio Checconi Date: Thu, 16 Jul 2009 12:32:27 +0200 Subject: sched: Account for vruntime wrapping I spotted two sites that didn't take vruntime wrap-around into account. Fix these by creating a comparison helper that does do so. 
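Illustrative sketch (not from the patch): the comparison helper relies on the standard wrap-safe ordering trick for free-running u64 counters - compare the sign of the difference rather than the raw values, so the ordering stays correct even after one value overflows. A hypothetical standalone version:

#include <stdint.h>
#include <stdio.h>

/* True if 'a' is before 'b', even if one of them has wrapped around. */
static int before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t near_wrap = UINT64_MAX - 5;	/* just before the overflow */
	uint64_t wrapped   = 10;		/* just after the overflow */

	printf("plain '<'  : %d\n", near_wrap < wrapped);		/* 0: wrong order */
	printf("wrap-safe  : %d\n", before(near_wrap, wrapped));	/* 1: correct */
	return 0;
}
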
Signed-off-by: Fabio Checconi Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 7c248dc30f41..9ffb2b2ceba4 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -266,6 +266,12 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) return min_vruntime; } +static inline int entity_before(struct sched_entity *a, + struct sched_entity *b) +{ + return (s64)(a->vruntime - b->vruntime) < 0; +} + static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { return se->vruntime - cfs_rq->min_vruntime; @@ -1017,7 +1023,7 @@ static void yield_task_fair(struct rq *rq) /* * Already in the rightmost position? */ - if (unlikely(!rightmost || rightmost->vruntime < se->vruntime)) + if (unlikely(!rightmost || entity_before(rightmost, se))) return; /* @@ -1713,7 +1719,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) /* 'curr' will be NULL if the child belongs to a different group */ if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && - curr && curr->vruntime < se->vruntime) { + curr && entity_before(curr, se)) { /* * Upon rescheduling, sched_class::put_prev_task() will place * 'current' within the tree based on its new key value. -- cgit v1.2.3 From 413ee3b48ab582ffea33e7e140c7a2c5ea657e9a Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 16 Jul 2009 15:15:52 +0200 Subject: perf_counter: Make sure we dont leak kernel memory to userspace There are a few places we are leaking tiny amounts of kernel memory to userspace. This happens when writing out strings because we always align the end to 64 bits. To avoid this we should always use an appropriately sized temporary buffer and ensure it is zeroed. Since d_path assembles the string from the end of the buffer backwards, we need to add 64 bits after the buffer to allow for alignment. We also need to copy arch_vma_name to the temporary buffer, because if we use it directly we may end up copying to userspace a number of bytes after the end of the string constant. Signed-off-by: Anton Blanchard Signed-off-by: Peter Zijlstra LKML-Reference: <20090716104817.273972048@samba.org> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c6c38fb7766a..f7a8ab9576e4 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2968,8 +2968,10 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event) struct perf_cpu_context *cpuctx; struct perf_counter_context *ctx; unsigned int size; - char *comm = comm_event->task->comm; + char comm[TASK_COMM_LEN]; + memset(comm, 0, sizeof(comm)); + strncpy(comm, comm_event->task->comm, sizeof(comm)); size = ALIGN(strlen(comm)+1, sizeof(u64)); comm_event->comm = comm; @@ -3088,8 +3090,15 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) char *buf = NULL; const char *name; + memset(tmp, 0, sizeof(tmp)); + if (file) { - buf = kzalloc(PATH_MAX, GFP_KERNEL); + /* + * d_path works from the end of the buffer backwards, so we + * need to add enough zero bytes after the string to handle + * the 64bit alignment we do later. 
+ */ + buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); if (!buf) { name = strncpy(tmp, "//enomem", sizeof(tmp)); goto got_name; @@ -3100,9 +3109,11 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) goto got_name; } } else { - name = arch_vma_name(mmap_event->vma); - if (name) + if (arch_vma_name(mmap_event->vma)) { + name = strncpy(tmp, arch_vma_name(mmap_event->vma), + sizeof(tmp)); goto got_name; + } if (!vma->vm_mm) { name = strncpy(tmp, "[vdso]", sizeof(tmp)); -- cgit v1.2.3 From ed900c054b541254f0ce5cedaf75206e29bd614e Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: perf_counter: Log vfork as a fork event Right now we don't output vfork events. Even though we should always see an exec after a vfork, we may get perfcounter samples between the vfork and exec. These samples can lead to some confusion when parsing perfcounter data. To keep things consistent we should always log a fork event. It will result in a little more log data, but is less confusing to trace parsing tools. Signed-off-by: Anton Blanchard Signed-off-by: Peter Zijlstra LKML-Reference: <20090716104817.589309391@samba.org> Signed-off-by: Ingo Molnar --- kernel/fork.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 467746b3f0aa..4812d60b29f8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1408,14 +1408,11 @@ long do_fork(unsigned long clone_flags, if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; init_completion(&vfork); - } else if (!(clone_flags & CLONE_VM)) { - /* - * vfork will do an exec which will call - * set_task_comm() - */ - perf_counter_fork(p); } + if (!(clone_flags & CLONE_THREAD)) + perf_counter_fork(p); + audit_finish_fork(p); tracehook_report_clone(regs, clone_flags, nr, p); -- cgit v1.2.3 From e5d490b252423605a77c54b2e35b10ea663763df Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Jul 2009 12:23:11 +0100 Subject: profile: Suppress warning about large allocations when profile=1 is specified When profile= is used, a large buffer is allocated early at boot. This can be larger than what the page allocator can provide so it prints a warning. However, the caller is able to handle the situation so this patch suppresses the warning. Signed-off-by: Mel Gorman Reviewed-by: KOSAKI Motohiro Cc: Linux Memory Management List Cc: Heinz Diehl Cc: David Miller Cc: Arnaldo Carvalho de Melo Cc: Mel Gorman Cc: Andrew Morton LKML-Reference: <1247656992-19846-3-git-send-email-mel@csn.ul.ie> Signed-off-by: Ingo Molnar --- kernel/profile.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index 69911b5745eb..419250ebec4d 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -117,11 +117,12 @@ int __ref profile_init(void) cpumask_copy(prof_cpu_mask, cpu_possible_mask); - prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); + prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN); if (prof_buffer) return 0; - prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO); + prof_buffer = alloc_pages_exact(buffer_bytes, + GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN); if (prof_buffer) return 0; -- cgit v1.2.3 From a468d389349a7560249b355cdb6d2097ea1616c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Jul 2009 14:15:46 +0200 Subject: sched: fix load average accounting vs. cpu hotplug The new load average code clears rq->calc_load_active on CPU_ONLINE. 
That's wrong as the new onlined CPU might have got a scheduler tick already and accounted the delta to the stale value of the time we offlined the CPU. Clear the value when we cleanup the dead CPU instead. Also move the update of the calc_load_update time for the newly online CPU to CPU_UP_PREPARE to avoid that the CPU plays catch up with the stale update time value. Signed-off-by: Thomas Gleixner --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 98972d366fdc..1b59e265273b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7289,6 +7289,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu) static void calc_global_load_remove(struct rq *rq) { atomic_long_sub(rq->calc_load_active, &calc_load_tasks); + rq->calc_load_active = 0; } #endif /* CONFIG_HOTPLUG_CPU */ @@ -7515,6 +7516,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) task_rq_unlock(rq, &flags); get_task_struct(p); cpu_rq(cpu)->migration_thread = p; + rq->calc_load_update = calc_load_update; break; case CPU_ONLINE: @@ -7525,8 +7527,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) /* Update our root-domain */ rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); - rq->calc_load_update = calc_load_update; - rq->calc_load_active = 0; if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); -- cgit v1.2.3 From 6301cb95c119ebf324bb96ee226fa9ddffad80a7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Jul 2009 14:15:47 +0200 Subject: sched: fix nr_uninterruptible accounting of frozen tasks really commit e3c8ca8336 (sched: do not count frozen tasks toward load) broke the nr_uninterruptible accounting on freeze/thaw. On freeze the task is excluded from accounting with a check for (task->flags & PF_FROZEN), but that flag is cleared before the task is thawed. So while we prevent that the task with state TASK_UNINTERRUPTIBLE is accounted to nr_uninterruptible on freeze we decrement nr_uninterruptible on thaw. Use a separate flag which is handled by the freezing task itself. Set it before calling the scheduler with TASK_UNINTERRUPTIBLE state and clear it after we return from frozen state. Cc: Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 3 ++- kernel/freezer.c | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 16a982e389fb..3ab08e4bb6b8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -209,7 +209,7 @@ extern unsigned long long time_sync_thresh; ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) #define task_contributes_to_load(task) \ ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ - (task->flags & PF_FROZEN) == 0) + (task->flags & PF_FREEZING) == 0) #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) @@ -1680,6 +1680,7 @@ extern cputime_t task_gtime(struct task_struct *p); #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_FLUSHER 0x00001000 /* responsible for disk writeback */ #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ +#define PF_FREEZING 0x00004000 /* freeze in progress. 
do not account to load */ #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ #define PF_FROZEN 0x00010000 /* frozen for system suspend */ #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ diff --git a/kernel/freezer.c b/kernel/freezer.c index 2f4936cf7083..bd1d42b17cb2 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -44,12 +44,19 @@ void refrigerator(void) recalc_sigpending(); /* We sent fake signal, clean it up */ spin_unlock_irq(¤t->sighand->siglock); + /* prevent accounting of that task to load */ + current->flags |= PF_FREEZING; + for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (!frozen(current)) break; schedule(); } + + /* Remove the accounting blocker */ + current->flags &= ~PF_FREEZING; + pr_debug("%s left refrigerator\n", current->comm); __set_current_state(save); } -- cgit v1.2.3 From 4841158b26e28e1476eed84c7347c18f11317750 Mon Sep 17 00:00:00 2001 From: Pavel Roskin Date: Sat, 18 Jul 2009 16:46:02 -0400 Subject: timer: Avoid reading uninitialized data timer->expires may be uninitialized, so check timer_pending() before touching timer->expires to pacify kmemcheck. Signed-off-by: Pavel Roskin LKML-Reference: <20090718204602.5191.360.stgit@mj.roinet.com> Signed-off-by: Thomas Gleixner --- kernel/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 0b36b9e5cc8b..a7f07d5a6241 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -714,7 +714,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires) * networking code - if the timer is re-modified * to be the same thing then just return: */ - if (timer->expires == expires && timer_pending(timer)) + if (timer_pending(timer) && timer->expires == expires) return 1; return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); -- cgit v1.2.3 From 79ef2bb01445400def20c7993b27fbcad27ca95f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 19 Jul 2009 17:09:12 +0200 Subject: clocksource: Prevent NULL pointer dereference Writing a zero length string to sys/.../current_clocksource will cause a NULL pointer dereference if the clock events system is in one shot (highres or nohz) mode. Pointed-out-by: Dan Carpenter LKML-Reference: Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 592bf584d1d2..7466cb811251 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -513,7 +513,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, * Check to make sure we don't switch to a non-highres capable * clocksource if the tick code is in oneshot mode (highres or nohz) */ - if (tick_oneshot_mode_active() && + if (tick_oneshot_mode_active() && ovr && !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) { printk(KERN_WARNING "%s clocksource is not HRT compatible. " "Cannot switch while in HRT/NOHZ mode\n", ovr->name); -- cgit v1.2.3 From 591d2fb02ea80472d846c0b8507007806bdd69cc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Jul 2009 11:09:39 +0200 Subject: genirq: Delegate irq affinity setting to the irq thread irq_set_thread_affinity() calls set_cpus_allowed_ptr() which might sleep, but irq_set_thread_affinity() is called with desc->lock held and can be called from hard interrupt context as well. The code has another bug as it does not hold a ref on the task struct as required by set_cpus_allowed_ptr(). 
Just set the IRQTF_AFFINITY bit in action->thread_flags. The next time the thread runs it migrates itself. Solves all of the above problems nicely. Add kerneldoc to irq_set_thread_affinity() while at it. Signed-off-by: Thomas Gleixner LKML-Reference: --- include/linux/interrupt.h | 2 ++ kernel/irq/internals.h | 3 +-- kernel/irq/manage.c | 50 +++++++++++++++++++++++++++++++++++++++++------ kernel/irq/migration.c | 2 +- 4 files changed, 48 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 2721f07e9354..88b056ac5629 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -64,11 +64,13 @@ * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run * IRQTF_DIED - handler thread died * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed + * IRQTF_AFFINITY - irq thread is requested to adjust affinity */ enum { IRQTF_RUNTHREAD, IRQTF_DIED, IRQTF_WARNED, + IRQTF_AFFINITY, }; typedef irqreturn_t (*irq_handler_t)(int, void *); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 73468253143b..e70ed5592eb9 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -42,8 +42,7 @@ static inline void unregister_handler_proc(unsigned int irq, extern int irq_select_affinity_usr(unsigned int irq); -extern void -irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask); +extern void irq_set_thread_affinity(struct irq_desc *desc); /* * Debugging printout: diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 50da67672901..f0de36f13a44 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -80,14 +80,22 @@ int irq_can_set_affinity(unsigned int irq) return 1; } -void -irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) +/** + * irq_set_thread_affinity - Notify irq threads to adjust affinity + * @desc: irq descriptor which has affitnity changed + * + * We just set IRQTF_AFFINITY and delegate the affinity setting + * to the interrupt thread itself. We can not call + * set_cpus_allowed_ptr() here as we hold desc->lock and this + * code can be called from hard interrupt context. + */ +void irq_set_thread_affinity(struct irq_desc *desc) { struct irqaction *action = desc->action; while (action) { if (action->thread) - set_cpus_allowed_ptr(action->thread, cpumask); + set_bit(IRQTF_AFFINITY, &action->thread_flags); action = action->next; } } @@ -112,7 +120,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) if (desc->status & IRQ_MOVE_PCNTXT) { if (!desc->chip->set_affinity(irq, cpumask)) { cpumask_copy(desc->affinity, cpumask); - irq_set_thread_affinity(desc, cpumask); + irq_set_thread_affinity(desc); } } else { @@ -122,7 +130,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) #else if (!desc->chip->set_affinity(irq, cpumask)) { cpumask_copy(desc->affinity, cpumask); - irq_set_thread_affinity(desc, cpumask); + irq_set_thread_affinity(desc); } #endif desc->status |= IRQ_AFFINITY_SET; @@ -176,7 +184,7 @@ int irq_select_affinity_usr(unsigned int irq) spin_lock_irqsave(&desc->lock, flags); ret = setup_affinity(irq, desc); if (!ret) - irq_set_thread_affinity(desc, desc->affinity); + irq_set_thread_affinity(desc); spin_unlock_irqrestore(&desc->lock, flags); return ret; @@ -443,6 +451,34 @@ static int irq_wait_for_interrupt(struct irqaction *action) return -1; } +/* + * Check whether we need to change the affinity of the interrupt thread. 
+ */ +static void +irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) +{ + cpumask_var_t mask; + + if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) + return; + + /* + * In case we are out of memory we set IRQTF_AFFINITY again and + * try again next time + */ + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { + set_bit(IRQTF_AFFINITY, &action->thread_flags); + return; + } + + spin_lock_irq(&desc->lock); + cpumask_copy(mask, desc->affinity); + spin_unlock_irq(&desc->lock); + + set_cpus_allowed_ptr(current, mask); + free_cpumask_var(mask); +} + /* * Interrupt handler thread */ @@ -458,6 +494,8 @@ static int irq_thread(void *data) while (!irq_wait_for_interrupt(action)) { + irq_thread_check_affinity(desc, action); + atomic_inc(&desc->threads_active); spin_lock_irq(&desc->lock); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index cfe767ca1545..fcb6c96f2627 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -45,7 +45,7 @@ void move_masked_irq(int irq) < nr_cpu_ids)) if (!desc->chip->set_affinity(irq, desc->pending_mask)) { cpumask_copy(desc->affinity, desc->pending_mask); - irq_set_thread_affinity(desc, desc->pending_mask); + irq_set_thread_affinity(desc); } cpumask_clear(desc->pending_mask); -- cgit v1.2.3 From 9ba5f005c994ad28e266a0cd14ef29354be382c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Jul 2009 14:18:35 +0200 Subject: softirq: introduce tasklet_hrtimer infrastructure commit ca109491f (hrtimer: removing all ur callback modes) moved all hrtimer callbacks into hard interrupt context when high resolution timers are active. That breaks code which relied on the assumption that the callback happens in softirq context. Provide a generic infrastructure which combines tasklets and hrtimers together to provide an in-softirq hrtimer experience. 
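A minimal usage sketch of the combined interface (hypothetical driver code, not part of the patch), based on the tasklet_hrtimer_init()/tasklet_hrtimer_start()/tasklet_hrtimer_cancel() helpers added below:

#include <linux/interrupt.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct tasklet_hrtimer my_ttimer;

/* Runs in softirq context even when high resolution timers are active. */
static enum hrtimer_restart my_timer_fn(struct hrtimer *timer)
{
	/* ... softirq-safe work goes here ... */
	return HRTIMER_NORESTART;
}

static void my_start(void)
{
	tasklet_hrtimer_init(&my_ttimer, my_timer_fn,
			     CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	tasklet_hrtimer_start(&my_ttimer, ktime_set(0, 10 * NSEC_PER_MSEC),
			      HRTIMER_MODE_REL);
}

static void my_stop(void)
{
	tasklet_hrtimer_cancel(&my_ttimer);
}
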
Signed-off-by: Peter Zijlstra Cc: torvalds@linux-foundation.org Cc: kaber@trash.net Cc: David Miller LKML-Reference: <1248265724.27058.1366.camel@twins> Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 26 +++++++++++++++++++ kernel/softirq.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 89 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 2721f07e9354..fd4c9c63c757 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -517,6 +518,31 @@ extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu); extern void tasklet_init(struct tasklet_struct *t, void (*func)(unsigned long), unsigned long data); +struct tasklet_hrtimer { + struct hrtimer timer; + struct tasklet_struct tasklet; + enum hrtimer_restart (*function)(struct hrtimer *); +}; + +extern void +tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, + enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t which_clock, enum hrtimer_mode mode); + +static inline +int tasklet_hrtimer_start(struct tasklet_hrtimer *ttimer, ktime_t time, + const enum hrtimer_mode mode) +{ + return hrtimer_start(&ttimer->timer, time, mode); +} + +static inline +void tasklet_hrtimer_cancel(struct tasklet_hrtimer *ttimer) +{ + hrtimer_cancel(&ttimer->timer); + tasklet_kill(&ttimer->tasklet); +} + /* * Autoprobing for irqs: * diff --git a/kernel/softirq.c b/kernel/softirq.c index 3a94905fa5d2..eb5e131a0485 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -345,7 +345,9 @@ void open_softirq(int nr, void (*action)(struct softirq_action *)) softirq_vec[nr].action = action; } -/* Tasklets */ +/* + * Tasklets + */ struct tasklet_head { struct tasklet_struct *head; @@ -493,6 +495,66 @@ void tasklet_kill(struct tasklet_struct *t) EXPORT_SYMBOL(tasklet_kill); +/* + * tasklet_hrtimer + */ + +/* + * The trampoline is called when the hrtimer expires. If this is + * called from the hrtimer interrupt then we schedule the tasklet as + * the timer callback function expects to run in softirq context. If + * it's called in softirq context anyway (i.e. high resolution timers + * disabled) then the hrtimer callback is called right away. 
+ */ +static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) +{ + struct tasklet_hrtimer *ttimer = + container_of(timer, struct tasklet_hrtimer, timer); + + if (hrtimer_is_hres_active(timer)) { + tasklet_hi_schedule(&ttimer->tasklet); + return HRTIMER_NORESTART; + } + return ttimer->function(timer); +} + +/* + * Helper function which calls the hrtimer callback from + * tasklet/softirq context + */ +static void __tasklet_hrtimer_trampoline(unsigned long data) +{ + struct tasklet_hrtimer *ttimer = (void *)data; + enum hrtimer_restart restart; + + restart = ttimer->function(&ttimer->timer); + if (restart != HRTIMER_NORESTART) + hrtimer_restart(&ttimer->timer); +} + +/** + * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks + * @ttimer: tasklet_hrtimer which is initialized + * @function: hrtimer callback funtion which gets called from softirq context + * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) + * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) + */ +void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, + enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t which_clock, enum hrtimer_mode mode) +{ + hrtimer_init(&ttimer->timer, which_clock, mode); + ttimer->timer.function = __hrtimer_tasklet_trampoline; + tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline, + (unsigned long)ttimer); + ttimer->function = function; +} +EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); + +/* + * Remote softirq bits + */ + DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); EXPORT_PER_CPU_SYMBOL(softirq_work_list); -- cgit v1.2.3 From c9f73a3dd27e03411f18a58c0814d51392d2b17a Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 21 Jul 2009 00:55:05 -0700 Subject: perf: Fix stack data leak the "reserved" field was not initialized to zero, resulting in 4 bytes of stack data leaking to userspace.... Signed-off-by: Arjan van de Ven Signed-off-by: Peter Zijlstra --- kernel/perf_counter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 5c6fae4f43d8..ff854fd89a81 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2666,6 +2666,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, header.size += sizeof(cpu_entry); cpu_entry.cpu = raw_smp_processor_id(); + cpu_entry.reserved = 0; } if (sample_type & PERF_SAMPLE_PERIOD) -- cgit v1.2.3 From 573402db02746179b3f95f83a11a787501f52d0a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Jul 2009 11:13:50 +0200 Subject: perf_counter: Plug more stack leaks Per example of Arjan's patch, I went through and found a few more. 
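The class of bug these stack-leak patches close, reduced to a standalone sketch with hypothetical names (the real fixes explicitly initialize the perf event headers): any field of an on-stack struct that is never written still gets copied out, carrying whatever bytes happened to be on the stack.

#include <stdio.h>
#include <string.h>

struct cpu_entry {
	unsigned int cpu;
	unsigned int reserved;	/* if never assigned, stale stack bytes leak out */
};

/* Leaky variant: only 'cpu' is set before the struct is copied out. */
static void fill_leaky(struct cpu_entry *out, unsigned int cpu)
{
	struct cpu_entry e;

	e.cpu = cpu;
	memcpy(out, &e, sizeof(e));	/* stands in for the copy to userspace */
}

/* Fixed variant: clear every field (and padding) before use. */
static void fill_fixed(struct cpu_entry *out, unsigned int cpu)
{
	struct cpu_entry e;

	memset(&e, 0, sizeof(e));
	e.cpu = cpu;
	memcpy(out, &e, sizeof(e));
}

int main(void)
{
	struct cpu_entry out;

	fill_leaky(&out, 3);	/* out.reserved now holds arbitrary stack data */
	fill_fixed(&out, 3);	/* out.reserved is guaranteed to be zero */
	printf("cpu=%u reserved=%u\n", out.cpu, out.reserved);
	return 0;
}
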
Signed-off-by: Peter Zijlstra --- kernel/perf_counter.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ff854fd89a81..e1d6a3aa1333 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2897,8 +2897,11 @@ void perf_counter_fork(struct task_struct *task) .event = { .header = { .type = PERF_EVENT_FORK, + .misc = 0, .size = sizeof(fork_event.event), }, + /* .pid */ + /* .ppid */ }, }; @@ -3008,8 +3011,16 @@ void perf_counter_comm(struct task_struct *task) comm_event = (struct perf_comm_event){ .task = task, + /* .comm */ + /* .comm_size */ .event = { - .header = { .type = PERF_EVENT_COMM, }, + .header = { + .type = PERF_EVENT_COMM, + .misc = 0, + /* .size */ + }, + /* .pid */ + /* .tid */ }, }; @@ -3160,8 +3171,16 @@ void __perf_counter_mmap(struct vm_area_struct *vma) mmap_event = (struct perf_mmap_event){ .vma = vma, + /* .file_name */ + /* .file_size */ .event = { - .header = { .type = PERF_EVENT_MMAP, }, + .header = { + .type = PERF_EVENT_MMAP, + .misc = 0, + /* .size */ + }, + /* .pid */ + /* .tid */ .start = vma->vm_start, .len = vma->vm_end - vma->vm_start, .pgoff = vma->vm_pgoff, -- cgit v1.2.3 From 7f453c24b95a085fc7bd35d53b33abc4dc5a048b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jul 2009 13:19:40 +0200 Subject: perf_counter: PERF_SAMPLE_ID and inherited counters Anton noted that for inherited counters the counter-id as provided by PERF_SAMPLE_ID isn't mappable to the id found through PERF_RECORD_ID because each inherited counter gets its own id. His suggestion was to always return the parent counter id, since that is the primary counter id as exposed. However, these inherited counters have a unique identifier so that events like PERF_EVENT_PERIOD and PERF_EVENT_THROTTLE can be specific about which counter gets modified, which is important when trying to normalize the sample streams. This patch removes PERF_EVENT_PERIOD in favour of PERF_SAMPLE_PERIOD, which is more useful anyway, since changing periods became a lot more common than initially thought -- rendering PERF_EVENT_PERIOD the less useful solution (also, PERF_SAMPLE_PERIOD reports the more accurate value, since it reports the value used to trigger the overflow, whereas PERF_EVENT_PERIOD simply reports the requested period changed, which might only take effect on the next cycle). This still leaves us PERF_EVENT_THROTTLE to consider, but since that _should_ be a rare occurrence, and linking it to a primary id is the most useful bit to diagnose the problem, we introduce a PERF_SAMPLE_STREAM_ID, for those few cases where the full reconstruction is important. 
[Does change the ABI a little, but I see no other way out] Suggested-by: Anton Blanchard Signed-off-by: Peter Zijlstra LKML-Reference: <1248095846.15751.8781.camel@twins> --- include/linux/perf_counter.h | 15 ++----- kernel/perf_counter.c | 92 +++++++++++++++---------------------------- tools/perf/builtin-annotate.c | 24 ----------- tools/perf/builtin-report.c | 24 ----------- 4 files changed, 35 insertions(+), 120 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 5e970c7d3fd5..bd15d7a5f5ce 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -120,8 +120,9 @@ enum perf_counter_sample_format { PERF_SAMPLE_ID = 1U << 6, PERF_SAMPLE_CPU = 1U << 7, PERF_SAMPLE_PERIOD = 1U << 8, + PERF_SAMPLE_STREAM_ID = 1U << 9, - PERF_SAMPLE_MAX = 1U << 9, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 10, /* non-ABI */ }; /* @@ -312,16 +313,7 @@ enum perf_event_type { * struct perf_event_header header; * u64 time; * u64 id; - * u64 sample_period; - * }; - */ - PERF_EVENT_PERIOD = 4, - - /* - * struct { - * struct perf_event_header header; - * u64 time; - * u64 id; + * u64 stream_id; * }; */ PERF_EVENT_THROTTLE = 5, @@ -356,6 +348,7 @@ enum perf_event_type { * { u64 time; } && PERF_SAMPLE_TIME * { u64 addr; } && PERF_SAMPLE_ADDR * { u64 id; } && PERF_SAMPLE_ID + * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID * { u32 cpu, res; } && PERF_SAMPLE_CPU * { u64 period; } && PERF_SAMPLE_PERIOD * diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index e1d6a3aa1333..7530588fa5c5 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -154,6 +154,20 @@ static void unclone_ctx(struct perf_counter_context *ctx) } } +/* + * If we inherit counters we want to return the parent counter id + * to userspace. + */ +static u64 primary_counter_id(struct perf_counter *counter) +{ + u64 id = counter->id; + + if (counter->parent) + id = counter->parent->id; + + return id; +} + /* * Get the perf_counter_context for a task and lock it. 
* This has to cope with with the fact that until it is locked, @@ -1296,7 +1310,6 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_counter *counter, int enable); -static void perf_log_period(struct perf_counter *counter, u64 period); static void perf_adjust_period(struct perf_counter *counter, u64 events) { @@ -1315,8 +1328,6 @@ static void perf_adjust_period(struct perf_counter *counter, u64 events) if (!sample_period) sample_period = 1; - perf_log_period(counter, sample_period); - hwc->sample_period = sample_period; } @@ -1705,7 +1716,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) values[n++] = counter->total_time_running + atomic64_read(&counter->child_total_time_running); if (counter->attr.read_format & PERF_FORMAT_ID) - values[n++] = counter->id; + values[n++] = primary_counter_id(counter); mutex_unlock(&counter->child_mutex); if (count < n * sizeof(u64)) @@ -1812,8 +1823,6 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) counter->attr.sample_freq = value; } else { - perf_log_period(counter, value); - counter->attr.sample_period = value; counter->hw.sample_period = value; } @@ -2662,6 +2671,9 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_ID) header.size += sizeof(u64); + if (sample_type & PERF_SAMPLE_STREAM_ID) + header.size += sizeof(u64); + if (sample_type & PERF_SAMPLE_CPU) { header.size += sizeof(cpu_entry); @@ -2705,7 +2717,13 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_ADDR) perf_output_put(&handle, data->addr); - if (sample_type & PERF_SAMPLE_ID) + if (sample_type & PERF_SAMPLE_ID) { + u64 id = primary_counter_id(counter); + + perf_output_put(&handle, id); + } + + if (sample_type & PERF_SAMPLE_STREAM_ID) perf_output_put(&handle, counter->id); if (sample_type & PERF_SAMPLE_CPU) @@ -2728,7 +2746,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, if (sub != counter) sub->pmu->read(sub); - group_entry.id = sub->id; + group_entry.id = primary_counter_id(sub); group_entry.counter = atomic64_read(&sub->count); perf_output_put(&handle, group_entry); @@ -2788,15 +2806,8 @@ perf_counter_read_event(struct perf_counter *counter, } if (counter->attr.read_format & PERF_FORMAT_ID) { - u64 id; - event.header.size += sizeof(u64); - if (counter->parent) - id = counter->parent->id; - else - id = counter->id; - - event.format[i++] = id; + event.format[i++] = primary_counter_id(counter); } ret = perf_output_begin(&handle, counter, event.header.size, 0, 0); @@ -3190,49 +3201,6 @@ void __perf_counter_mmap(struct vm_area_struct *vma) perf_counter_mmap_event(&mmap_event); } -/* - * Log sample_period changes so that analyzing tools can re-normalize the - * event flow. 
- */ - -struct freq_event { - struct perf_event_header header; - u64 time; - u64 id; - u64 period; -}; - -static void perf_log_period(struct perf_counter *counter, u64 period) -{ - struct perf_output_handle handle; - struct freq_event event; - int ret; - - if (counter->hw.sample_period == period) - return; - - if (counter->attr.sample_type & PERF_SAMPLE_PERIOD) - return; - - event = (struct freq_event) { - .header = { - .type = PERF_EVENT_PERIOD, - .misc = 0, - .size = sizeof(event), - }, - .time = sched_clock(), - .id = counter->id, - .period = period, - }; - - ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0); - if (ret) - return; - - perf_output_put(&handle, event); - perf_output_end(&handle); -} - /* * IRQ throttle logging */ @@ -3246,14 +3214,16 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) struct perf_event_header header; u64 time; u64 id; + u64 stream_id; } throttle_event = { .header = { .type = PERF_EVENT_THROTTLE + 1, .misc = 0, .size = sizeof(throttle_event), }, - .time = sched_clock(), - .id = counter->id, + .time = sched_clock(), + .id = primary_counter_id(counter), + .stream_id = counter->id, }; ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 5f9eefecc574..1dba568e1941 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -74,20 +74,12 @@ struct fork_event { u32 pid, ppid; }; -struct period_event { - struct perf_event_header header; - u64 time; - u64 id; - u64 sample_period; -}; - typedef union event_union { struct perf_event_header header; struct ip_event ip; struct mmap_event mmap; struct comm_event comm; struct fork_event fork; - struct period_event period; } event_t; @@ -997,19 +989,6 @@ process_fork_event(event_t *event, unsigned long offset, unsigned long head) return 0; } -static int -process_period_event(event_t *event, unsigned long offset, unsigned long head) -{ - dprintf("%p [%p]: PERF_EVENT_PERIOD: time:%Ld, id:%Ld: period:%Ld\n", - (void *)(offset + head), - (void *)(long)(event->header.size), - event->period.time, - event->period.id, - event->period.sample_period); - - return 0; -} - static int process_event(event_t *event, unsigned long offset, unsigned long head) { @@ -1025,9 +1004,6 @@ process_event(event_t *event, unsigned long offset, unsigned long head) case PERF_EVENT_FORK: return process_fork_event(event, offset, head); - - case PERF_EVENT_PERIOD: - return process_period_event(event, offset, head); /* * We dont process them right now but they are fine: */ diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index a118bc77286d..b20a4b6e31b7 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -101,13 +101,6 @@ struct fork_event { u32 pid, ppid; }; -struct period_event { - struct perf_event_header header; - u64 time; - u64 id; - u64 sample_period; -}; - struct lost_event { struct perf_event_header header; u64 id; @@ -127,7 +120,6 @@ typedef union event_union { struct mmap_event mmap; struct comm_event comm; struct fork_event fork; - struct period_event period; struct lost_event lost; struct read_event read; } event_t; @@ -1635,19 +1627,6 @@ process_fork_event(event_t *event, unsigned long offset, unsigned long head) return 0; } -static int -process_period_event(event_t *event, unsigned long offset, unsigned long head) -{ - dprintf("%p [%p]: PERF_EVENT_PERIOD: time:%Ld, id:%Ld: period:%Ld\n", - (void *)(offset + head), - (void 
*)(long)(event->header.size), - event->period.time, - event->period.id, - event->period.sample_period); - - return 0; -} - static int process_lost_event(event_t *event, unsigned long offset, unsigned long head) { @@ -1729,9 +1708,6 @@ process_event(event_t *event, unsigned long offset, unsigned long head) case PERF_EVENT_FORK: return process_fork_event(event, offset, head); - case PERF_EVENT_PERIOD: - return process_period_event(event, offset, head); - case PERF_EVENT_LOST: return process_lost_event(event, offset, head); -- cgit v1.2.3 From 966ee4d6b887c14159043ac80b8c3661d2bbe5e2 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 22 Jul 2009 23:05:46 +1000 Subject: perf_counter: Fix throttle/unthrottle event logging Right now we only print PERF_EVENT_THROTTLE + 1 (ie PERF_EVENT_UNTHROTTLE). Fix this to print both a throttle and unthrottle event. Signed-off-by: Anton Blanchard Signed-off-by: Peter Zijlstra LKML-Reference: <20090722130546.GE9029@kryten> --- kernel/perf_counter.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 7530588fa5c5..787d4daef185 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3217,7 +3217,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) u64 stream_id; } throttle_event = { .header = { - .type = PERF_EVENT_THROTTLE + 1, + .type = PERF_EVENT_THROTTLE, .misc = 0, .size = sizeof(throttle_event), }, @@ -3226,6 +3226,9 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) .stream_id = counter->id, }; + if (enable) + throttle_event.header.type = PERF_EVENT_UNTHROTTLE; + ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); if (ret) return; -- cgit v1.2.3 From 0dc3d523e8bc4718e0be2e4a742367d6e4be77cd Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 21 Jul 2009 00:55:05 -0700 Subject: perf: fix stack data leak the "reserved" field was not initialized to zero, resulting in 4 bytes of stack data leaking to userspace.... Signed-off-by: Arjan van de Ven Acked-by: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/perf_counter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index a641eb753b8c..7bc888dfd06a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2665,6 +2665,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, header.size += sizeof(cpu_entry); cpu_entry.cpu = raw_smp_processor_id(); + cpu_entry.reserved = 0; } if (sample_type & PERF_SAMPLE_PERIOD) -- cgit v1.2.3 From 61f3826133dc07142935fb5712fc738e19eb5575 Mon Sep 17 00:00:00 2001 From: Bruno Premont Date: Wed, 22 Jul 2009 22:22:32 +0200 Subject: genirq: Fix UP compile failure caused by irq_thread_check_affinity Since genirq: Delegate irq affinity setting to the irq thread (591d2fb02ea80472d846c0b8507007806bdd69cc) compilation with CONFIG_SMP=n fails with following error: /usr/src/linux-2.6/kernel/irq/manage.c: In function 'irq_thread_check_affinity': /usr/src/linux-2.6/kernel/irq/manage.c:475: error: 'struct irq_desc' has no member named 'affinity' make[4]: *** [kernel/irq/manage.o] Error 1 That commit adds a new function irq_thread_check_affinity() which uses struct irq_desc.affinity which is only available for CONFIG_SMP=y. Move that function under #ifdef CONFIG_SMP. 
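As a sketch of the idiom the fix adopts (not taken verbatim from the patch): the real helper is compiled only for CONFIG_SMP=y and an empty inline stub is provided otherwise, so callers need no #ifdef of their own.

#ifdef CONFIG_SMP
static void
irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
{
	/* free to use desc->affinity, which only exists with CONFIG_SMP=y */
}
#else
static inline void
irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
#endif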
[ tglx@brownpaperbag: compile and boot tested on UP and SMP ] Signed-off-by: Bruno Premont LKML-Reference: <20090722222232.2eb3e1c4@neptune.home> Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f0de36f13a44..61c679db4687 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -451,6 +451,7 @@ static int irq_wait_for_interrupt(struct irqaction *action) return -1; } +#ifdef CONFIG_SMP /* * Check whether we need to change the affinity of the interrupt thread. */ @@ -478,6 +479,10 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) set_cpus_allowed_ptr(current, mask); free_cpumask_var(mask); } +#else +static inline void +irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } +#endif /* * Interrupt handler thread -- cgit v1.2.3 From d8cc1ab793993c886c62abf77c93287df33ffd8b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 23 Jul 2009 11:28:40 +0800 Subject: trace_stack: Fix seqfile memory leak Every time we cat stack_trace, we leak memory allocated by seq_open(). Signed-off-by: Li Zefan LKML-Reference: <4A67D8E8.3020500@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index e644af910124..6a2a9d484cd6 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -301,17 +301,14 @@ static const struct seq_operations stack_trace_seq_ops = { static int stack_trace_open(struct inode *inode, struct file *file) { - int ret; - - ret = seq_open(file, &stack_trace_seq_ops); - - return ret; + return seq_open(file, &stack_trace_seq_ops); } static const struct file_operations stack_trace_fops = { .open = stack_trace_open, .read = seq_read, .llseek = seq_lseek, + .release = seq_release, }; int -- cgit v1.2.3 From 87827111a5538633b18e5c641ced673c4c2bb6ce Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 23 Jul 2009 11:29:11 +0800 Subject: function-graph: Fix seqfile memory leak Every time we cat set_graph_function, we leak memory allocated by seq_open(). 
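A minimal sketch of the pairing these seqfile fixes restore (placeholder names, not the patch itself): seq_open() allocates a struct seq_file and stores it in file->private_data, so the matching .release handler must call seq_release() to free it.

static int example_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &example_seq_ops);  /* allocates the seq_file */
}

static const struct file_operations example_fops = {
	.open		= example_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,	/* frees what seq_open() allocated */
};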
Signed-off-by: Li Zefan LKML-Reference: <4A67D907.2010500@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4521c77d1a1a..1f3ec2afa511 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2595,6 +2595,14 @@ ftrace_graph_open(struct inode *inode, struct file *file) return ret; } +static int +ftrace_graph_release(struct inode *inode, struct file *file) +{ + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + return 0; +} + static int ftrace_set_func(unsigned long *array, int *idx, char *buffer) { @@ -2724,9 +2732,10 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, } static const struct file_operations ftrace_graph_fops = { - .open = ftrace_graph_open, - .read = seq_read, - .write = ftrace_graph_write, + .open = ftrace_graph_open, + .read = seq_read, + .write = ftrace_graph_write, + .release = ftrace_graph_release, }; #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -- cgit v1.2.3 From 636eacee3b0c76915151db37203cc624becb6d7b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 23 Jul 2009 11:29:47 +0800 Subject: tracing/stat: Fix seqfile memory leak Every time we cat a trace_stat file, we leak memory allocated by seq_open(). Also fix memory leak in a failure path in tracing_stat_open(). Signed-off-by: Li Zefan LKML-Reference: <4A67D92B.4060704@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_stat.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index e66f5e493342..aea321c82fa0 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -73,7 +73,7 @@ static struct rb_node *release_next(struct rb_node *node) } } -static void reset_stat_session(struct stat_session *session) +static void __reset_stat_session(struct stat_session *session) { struct rb_node *node = session->stat_root.rb_node; @@ -83,10 +83,17 @@ static void reset_stat_session(struct stat_session *session) session->stat_root = RB_ROOT; } +static void reset_stat_session(struct stat_session *session) +{ + mutex_lock(&session->stat_mutex); + __reset_stat_session(session); + mutex_unlock(&session->stat_mutex); +} + static void destroy_session(struct stat_session *session) { debugfs_remove(session->file); - reset_stat_session(session); + __reset_stat_session(session); mutex_destroy(&session->stat_mutex); kfree(session); } @@ -150,7 +157,7 @@ static int stat_seq_init(struct stat_session *session) int i; mutex_lock(&session->stat_mutex); - reset_stat_session(session); + __reset_stat_session(session); if (!ts->stat_cmp) ts->stat_cmp = dummy_cmp; @@ -183,7 +190,7 @@ exit: return ret; exit_free_rbtree: - reset_stat_session(session); + __reset_stat_session(session); mutex_unlock(&session->stat_mutex); return ret; } @@ -250,16 +257,21 @@ static const struct seq_operations trace_stat_seq_ops = { static int tracing_stat_open(struct inode *inode, struct file *file) { int ret; - + struct seq_file *m; struct stat_session *session = inode->i_private; + ret = stat_seq_init(session); + if (ret) + return ret; + ret = seq_open(file, &trace_stat_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = session; - ret = stat_seq_init(session); + if (ret) { + reset_stat_session(session); + return ret; } + m = file->private_data; + m->private = session; return ret; 
} @@ -270,11 +282,9 @@ static int tracing_stat_release(struct inode *i, struct file *f) { struct stat_session *session = i->i_private; - mutex_lock(&session->stat_mutex); reset_stat_session(session); - mutex_unlock(&session->stat_mutex); - return 0; + return seq_release(i, f); } static const struct file_operations tracing_stat_fops = { -- cgit v1.2.3 From 4c739ff043e5787d97c9691d62cabf7a29e75a9d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 22 Jul 2009 23:11:03 -0400 Subject: tracing: show proper address for trace-printk format Since the trace_printk may use pointers to the format fields in the buffer, they are exported via debugfs/tracing/printk_formats. This is used by utilities that read the ring buffer in binary format. It helps the utilities map the address of the format in the binary buffer to what the printf format looks like. Unfortunately, the way the output code works, it exports the address of the pointer to the format address, and not the format address itself. This makes the file totally useless in trying to figure out what format string a binary address belongs to. Signed-off-by: Steven Rostedt --- kernel/trace/trace_printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 7b6278110827..687699d365ae 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -176,7 +176,7 @@ static int t_show(struct seq_file *m, void *v) const char *str = *fmt; int i; - seq_printf(m, "0x%lx : \"", (unsigned long)fmt); + seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt); /* * Tabs and new lines need to be converted. -- cgit v1.2.3 From 8650ae32ef7045e763825dee6256dde7f331bb85 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 22 Jul 2009 23:29:30 -0400 Subject: tracing: only truncate ftrace files when O_TRUNC is set The current code will truncate the ftrace files contents if O_APPEND is not set and the file is opened in write mode. This is incorrect. It should only truncate the file if O_TRUNC is set. Otherwise if one of these files is opened by a C program with fopen "r+", it will incorrectly truncate the file. 
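A userspace sketch of why keying off O_APPEND was wrong (the path assumes debugfs mounted at /sys/kernel/debug): fopen() mode "r+" maps to open() with plain O_RDWR, so neither O_APPEND nor O_TRUNC is set, while mode "w" adds O_TRUNC and may legitimately clear the file.

#include <stdio.h>

int main(void)
{
	/* "r+" -> O_RDWR: no O_TRUNC, no O_APPEND; contents must survive */
	FILE *keep = fopen("/sys/kernel/debug/tracing/set_ftrace_filter", "r+");
	/* "w" -> O_WRONLY|O_CREAT|O_TRUNC: clearing the filter is intended */
	FILE *clear = fopen("/sys/kernel/debug/tracing/set_ftrace_filter", "w");

	if (keep)
		fclose(keep);
	if (clear)
		fclose(clear);
	return 0;
}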
Reported-by: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 4 ++-- kernel/trace/trace.c | 2 +- kernel/trace/trace_events.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1f3ec2afa511..1e1d23c26308 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1662,7 +1662,7 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) mutex_lock(&ftrace_regex_lock); if ((file->f_mode & FMODE_WRITE) && - !(file->f_flags & O_APPEND)) + (file->f_flags & O_TRUNC)) ftrace_filter_reset(enable); if (file->f_mode & FMODE_READ) { @@ -2577,7 +2577,7 @@ ftrace_graph_open(struct inode *inode, struct file *file) mutex_lock(&graph_lock); if ((file->f_mode & FMODE_WRITE) && - !(file->f_flags & O_APPEND)) { + (file->f_flags & O_TRUNC)) { ftrace_graph_count = 0; memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8bc8d8afea6a..d8ef28574aa1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2031,7 +2031,7 @@ static int tracing_open(struct inode *inode, struct file *file) /* If this file was open for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && - !(file->f_flags & O_APPEND)) { + (file->f_flags & O_TRUNC)) { long cpu = (long) inode->i_private; if (cpu == TRACE_PIPE_ALL_CPU) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 53c8fd376a88..23d2972b22d6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -376,7 +376,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file) const struct seq_operations *seq_ops; if ((file->f_mode & FMODE_WRITE) && - !(file->f_flags & O_APPEND)) + (file->f_flags & O_TRUNC)) ftrace_clear_events(); seq_ops = inode->i_private; -- cgit v1.2.3 From bdff78707f3ce47e891f3201c9666122a70556ce Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 24 Jul 2009 15:30:45 -0400 Subject: trace: stop tracer in oops_enter() If trace_printk_on_oops is set we lose interesting trace information when the tracer is enabled across oops handling and printing. We want the trace which might give us information _WHY_ we oopsed. Signed-off-by: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/panic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 984b3ecbd72c..512ab73b0ca3 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -301,6 +301,7 @@ int oops_may_print(void) */ void oops_enter(void) { + tracing_off(); /* can't trust the integrity of the kernel anymore: */ debug_locks_off(); do_oops_enter_exit(); -- cgit v1.2.3 From 6560dc160f3a96b8f1f43e2c6b51aa6eb9898b90 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Thu, 23 Jul 2009 23:42:08 +0930 Subject: module: use MODULE_SYMBOL_PREFIX with module_layout The check_modstruct_version() needs to look up the symbol "module_layout" in the kernel, but it does so literally and not by a C identifier. The trouble is that it does not include a symbol prefix for those ports that need it (like the Blackfin and H8300 port). So make sure we tack on the MODULE_SYMBOL_PREFIX define to the front of it. 
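A sketch of the resulting lookup: MODULE_SYMBOL_PREFIX is per-architecture ("" on most ports, "_" on e.g. Blackfin), and adjacent string literals are pasted by the preprocessor, so the literal handed to find_symbol() matches the prefixed symbol table entry.

	/* expands to a lookup of "_module_layout" on prefixed ports */
	if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
			 &crc, true, false))
		BUG();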
Signed-off-by: Mike Frysinger Signed-off-by: Rusty Russell Signed-off-by: Linus Torvalds --- kernel/module.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 0a049837008e..fd1411403558 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1068,7 +1068,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, { const unsigned long *crc; - if (!find_symbol("module_layout", NULL, &crc, true, false)) + if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, + &crc, true, false)) BUG(); return check_version(sechdrs, versindex, "module_layout", mod, crc); } -- cgit v1.2.3 From 9ae260270c90643156cda73427aa1f04c923e627 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 19 Jun 2009 02:51:13 +0200 Subject: update the comment in kthread_stop() Commit 63706172f332fd3f6e7458ebfb35fa6de9c21dc5 ("kthreads: rework kthread_stop()") removed the limitation that the thread function must not call do_exit() itself, but forgot to update the comment. Since that commit it is OK to use kthread_stop() even if kthread can exit itself. Signed-off-by: Oleg Nesterov Signed-off-by: Rusty Russell Signed-off-by: Linus Torvalds --- kernel/kthread.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 9b1a7de26979..eb8751aa0418 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -180,10 +180,12 @@ EXPORT_SYMBOL(kthread_bind); * @k: thread created by kthread_create(). * * Sets kthread_should_stop() for @k to return true, wakes it, and - * waits for it to exit. Your threadfn() must not call do_exit() - * itself if you use this function! This can also be called after - * kthread_create() instead of calling wake_up_process(): the thread - * will exit without calling threadfn(). + * waits for it to exit. This can also be called after kthread_create() + * instead of calling wake_up_process(): the thread will exit without + * calling threadfn(). + * + * If threadfn() may call do_exit() itself, the caller must ensure + * task_struct can't go away. * * Returns the result of threadfn(), or %-EINTR if wake_up_process() * was never called. -- cgit v1.2.3 From 38ceb592fcac9110c6b3c87ea0a27bff68c43486 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 28 Jul 2009 20:11:24 +0800 Subject: tracing: Fix invalid function_graph entry When print_graph_entry() computes a function call entry event, it needs to also check the next entry to guess if it matches the return event of the current function entry. In order to look at this next event, it needs to consume the current entry before going ahead in the ring buffer. However, if the current event that gets consumed is the last one in the ring buffer head page, the ring_buffer may reuse the page for writers. The consumed entry will then become invalid because of possible racy overwriting. We must then handle this entry by making a copy of it.
The fix also applies on 2.6.30 Signed-off-by: Lai Jiangshan Cc: Steven Rostedt Cc: stable@kernel.org LKML-Reference: <4A6EEAEC.3050508@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_functions_graph.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d2249abafb53..420ec3487579 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -843,9 +843,16 @@ print_graph_function(struct trace_iterator *iter) switch (entry->type) { case TRACE_GRAPH_ENT: { - struct ftrace_graph_ent_entry *field; + /* + * print_graph_entry() may consume the current event, + * thus @field may become invalid, so we need to save it. + * sizeof(struct ftrace_graph_ent_entry) is very small, + * it can be safely saved at the stack. + */ + struct ftrace_graph_ent_entry *field, saved; trace_assign_type(field, entry); - return print_graph_entry(field, s, iter); + saved = *field; + return print_graph_entry(&saved, s, iter); } case TRACE_GRAPH_RET: { struct ftrace_graph_ret_entry *field; -- cgit v1.2.3 From 74e7ff8c50b6b022e6ffaa736b16a4dc161d3eaf Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 28 Jul 2009 20:17:22 +0800 Subject: tracing: Fix missing function_graph events when we splice_read from trace_pipe About a half events are missing when we splice_read from trace_pipe. They are unexpectedly consumed because we ignore the TRACE_TYPE_NO_CONSUME return value used by the function graph tracer when it needs to consume the events by itself to walk on the ring buffer. The same problem appears with ftrace_dump() Example of an output before this patch: 1) | ktime_get_real() { 1) 2.846 us | read_hpet(); 1) 4.558 us | } 1) 6.195 us | } After this patch: 0) | ktime_get_real() { 0) | getnstimeofday() { 0) 1.960 us | read_hpet(); 0) 3.597 us | } 0) 5.196 us | } The fix also applies on 2.6.30 Signed-off-by: Lai Jiangshan Cc: Steven Rostedt Cc: stable@kernel.org LKML-Reference: <4A6EEC52.90704@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8bc8d8afea6a..da984ad065ab 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3085,7 +3085,8 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) break; } - trace_consume(iter); + if (ret != TRACE_TYPE_NO_CONSUME) + trace_consume(iter); rem -= count; if (!find_next_entry_inc(iter)) { rem = 0; @@ -4233,8 +4234,11 @@ static void __ftrace_dump(bool disable_tracing) iter.pos = -1; if (find_next_entry_inc(&iter) != NULL) { - print_trace_line(&iter); - trace_consume(&iter); + int ret; + + ret = print_trace_line(&iter); + if (ret != TRACE_TYPE_NO_CONSUME) + trace_consume(&iter); } trace_printk_seq(&iter.seq); -- cgit v1.2.3 From 933b787b57ca8bdc0fc8fb2cbf67b5e6d21beb84 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 29 Jul 2009 15:02:07 -0700 Subject: mm: copy over oom_adj value at fork time Fix a post-2.6.31 regression which was introduced by 2ff05b2b4eac2e63d345fc731ea151a060247f53 ("oom: move oom_adj value from task_struct to mm_struct"). After moving the oom_adj value from the task struct to the mm_struct, the oom_adj value was no longer properly inherited by child processes. Copying over the oom_adj value at fork time fixes that bug. 
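A userspace sketch of the behaviour being restored (needs privilege to lower the value; -17 is OOM_DISABLE, and /proc/<pid>/oom_adj is the interface of this era):

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	FILE *f = fopen("/proc/self/oom_adj", "w");

	if (f) {
		fputs("-17\n", f);	/* OOM_DISABLE */
		fclose(f);
	}
	if (fork() == 0) {
		/* with the fix the child prints -17 again; with the
		 * regression it saw the default value instead */
		execlp("cat", "cat", "/proc/self/oom_adj", (char *)NULL);
		_exit(1);
	}
	return 0;
}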
[kosaki.motohiro@jp.fujitsu.com: test for current->mm before dereferencing it] Signed-off-by: Rik van Riel Reported-by: Paul Menage Cc: KOSAKI Motohiro Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 9b42695f0d14..29b532e718f7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -426,6 +426,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; + mm->oom_adj = (current->mm) ? current->mm->oom_adj : 0; mm->core_state = NULL; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); -- cgit v1.2.3 From 11c7da4b0ca76a57f51c996c883c480e203cf5a9 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 29 Jul 2009 15:02:08 -0700 Subject: kexec: fix omitting offset in extended crashkernel syntax Setting "crashkernel=512M-2G:64M,2G-:128M" does not work but it turns to work if it has a trailing-whitespace, like "crashkernel=512M-2G:64M,2G-:128M ". It was because of a bug in the parser, running over the cmdline. This patch adds a check of the termination. Reported-by: Jin Dongming Signed-off-by: Hidetoshi Seto Tested-by: Jin Dongming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index ae1c35201cc8..f336e2107f98 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1228,7 +1228,7 @@ static int __init parse_crashkernel_mem(char *cmdline, } while (*cur++ == ','); if (*crash_size > 0) { - while (*cur != ' ' && *cur != '@') + while (*cur && *cur != ' ' && *cur != '@') cur++; if (*cur == '@') { cur++; -- cgit v1.2.3 From 096b7fe012d66ed55e98bc8022405ede0cc80e96 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 29 Jul 2009 15:04:04 -0700 Subject: cgroups: fix pid namespace bug The bug was introduced by commit cc31edceee04a7b87f2be48f9489ebb72d264844 ("cgroups: convert tasks file to use a seq_file with shared pid array"). We cache a pid array for all threads that are opening the same "tasks" file, but the pids in the array are always from the namespace of the last process that opened the file, so all other threads will read pids from that namespace instead of their own namespaces. To fix it, we maintain a list of pid arrays, which is keyed by pid_ns. The list will be of length 1 at most time. Reported-by: Paul Menage Idea-by: Paul Menage Signed-off-by: Li Zefan Reviewed-by: Serge Hallyn Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 11 +++--- kernel/cgroup.c | 96 +++++++++++++++++++++++++++++++++++++------------- 2 files changed, 76 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 665fa70e4094..20411d2876f8 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -179,14 +179,11 @@ struct cgroup { */ struct list_head release_list; - /* pids_mutex protects the fields below */ + /* pids_mutex protects pids_list and cached pid arrays. 
*/ struct rw_semaphore pids_mutex; - /* Array of process ids in the cgroup */ - pid_t *tasks_pids; - /* How many files are using the current tasks_pids array */ - int pids_use_count; - /* Length of the current tasks_pids array */ - int pids_length; + + /* Linked list of struct cgroup_pids */ + struct list_head pids_list; /* For RCU-protected deletion */ struct rcu_head rcu_head; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3737a682cdf5..250dac05680f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -47,6 +47,7 @@ #include #include #include +#include #include @@ -960,6 +961,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->css_sets); INIT_LIST_HEAD(&cgrp->release_list); + INIT_LIST_HEAD(&cgrp->pids_list); init_rwsem(&cgrp->pids_mutex); } static void init_cgroup_root(struct cgroupfs_root *root) @@ -2201,12 +2203,30 @@ err: return ret; } +/* + * Cache pids for all threads in the same pid namespace that are + * opening the same "tasks" file. + */ +struct cgroup_pids { + /* The node in cgrp->pids_list */ + struct list_head list; + /* The cgroup those pids belong to */ + struct cgroup *cgrp; + /* The namepsace those pids belong to */ + struct pid_namespace *ns; + /* Array of process ids in the cgroup */ + pid_t *tasks_pids; + /* How many files are using the this tasks_pids array */ + int use_count; + /* Length of the current tasks_pids array */ + int length; +}; + static int cmppid(const void *a, const void *b) { return *(pid_t *)a - *(pid_t *)b; } - /* * seq_file methods for the "tasks" file. The seq_file position is the * next pid to display; the seq_file iterator is a pointer to the pid @@ -2221,45 +2241,47 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) * after a seek to the start). Use a binary-search to find the * next pid to display, if any */ - struct cgroup *cgrp = s->private; + struct cgroup_pids *cp = s->private; + struct cgroup *cgrp = cp->cgrp; int index = 0, pid = *pos; int *iter; down_read(&cgrp->pids_mutex); if (pid) { - int end = cgrp->pids_length; + int end = cp->length; while (index < end) { int mid = (index + end) / 2; - if (cgrp->tasks_pids[mid] == pid) { + if (cp->tasks_pids[mid] == pid) { index = mid; break; - } else if (cgrp->tasks_pids[mid] <= pid) + } else if (cp->tasks_pids[mid] <= pid) index = mid + 1; else end = mid; } } /* If we're off the end of the array, we're done */ - if (index >= cgrp->pids_length) + if (index >= cp->length) return NULL; /* Update the abstract position to be the actual pid that we found */ - iter = cgrp->tasks_pids + index; + iter = cp->tasks_pids + index; *pos = *iter; return iter; } static void cgroup_tasks_stop(struct seq_file *s, void *v) { - struct cgroup *cgrp = s->private; + struct cgroup_pids *cp = s->private; + struct cgroup *cgrp = cp->cgrp; up_read(&cgrp->pids_mutex); } static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) { - struct cgroup *cgrp = s->private; + struct cgroup_pids *cp = s->private; int *p = v; - int *end = cgrp->tasks_pids + cgrp->pids_length; + int *end = cp->tasks_pids + cp->length; /* * Advance to the next pid in the array. 
If this goes off the @@ -2286,26 +2308,33 @@ static struct seq_operations cgroup_tasks_seq_operations = { .show = cgroup_tasks_show, }; -static void release_cgroup_pid_array(struct cgroup *cgrp) +static void release_cgroup_pid_array(struct cgroup_pids *cp) { + struct cgroup *cgrp = cp->cgrp; + down_write(&cgrp->pids_mutex); - BUG_ON(!cgrp->pids_use_count); - if (!--cgrp->pids_use_count) { - kfree(cgrp->tasks_pids); - cgrp->tasks_pids = NULL; - cgrp->pids_length = 0; + BUG_ON(!cp->use_count); + if (!--cp->use_count) { + list_del(&cp->list); + put_pid_ns(cp->ns); + kfree(cp->tasks_pids); + kfree(cp); } up_write(&cgrp->pids_mutex); } static int cgroup_tasks_release(struct inode *inode, struct file *file) { - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + struct seq_file *seq; + struct cgroup_pids *cp; if (!(file->f_mode & FMODE_READ)) return 0; - release_cgroup_pid_array(cgrp); + seq = file->private_data; + cp = seq->private; + + release_cgroup_pid_array(cp); return seq_release(inode, file); } @@ -2324,6 +2353,8 @@ static struct file_operations cgroup_tasks_operations = { static int cgroup_tasks_open(struct inode *unused, struct file *file) { struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + struct pid_namespace *ns = current->nsproxy->pid_ns; + struct cgroup_pids *cp; pid_t *pidarray; int npids; int retval; @@ -2350,20 +2381,37 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file) * array if necessary */ down_write(&cgrp->pids_mutex); - kfree(cgrp->tasks_pids); - cgrp->tasks_pids = pidarray; - cgrp->pids_length = npids; - cgrp->pids_use_count++; + + list_for_each_entry(cp, &cgrp->pids_list, list) { + if (ns == cp->ns) + goto found; + } + + cp = kzalloc(sizeof(*cp), GFP_KERNEL); + if (!cp) { + up_write(&cgrp->pids_mutex); + kfree(pidarray); + return -ENOMEM; + } + cp->cgrp = cgrp; + cp->ns = ns; + get_pid_ns(ns); + list_add(&cp->list, &cgrp->pids_list); +found: + kfree(cp->tasks_pids); + cp->tasks_pids = pidarray; + cp->length = npids; + cp->use_count++; up_write(&cgrp->pids_mutex); file->f_op = &cgroup_tasks_operations; retval = seq_open(file, &cgroup_tasks_seq_operations); if (retval) { - release_cgroup_pid_array(cgrp); + release_cgroup_pid_array(cp); return retval; } - ((struct seq_file *)file->private_data)->private = cgrp; + ((struct seq_file *)file->private_data)->private = cp; return 0; } -- cgit v1.2.3 From 887032670d47366a8c8f25396ea7c14b7b2cc620 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 29 Jul 2009 15:04:06 -0700 Subject: cgroup avoid permanent sleep at rmdir After commit ec64f51545fffbc4cb968f0cea56341a4b07e85a ("cgroup: fix frequent -EBUSY at rmdir"), cgroup's rmdir (especially against memcg) doesn't return -EBUSY by temporary ref counts. That commit expects all refs after pre_destroy() is temporary but...it wasn't. Then, rmdir can wait permanently. This patch tries to fix that and change followings. - set CGRP_WAIT_ON_RMDIR flag before pre_destroy(). - clear CGRP_WAIT_ON_RMDIR flag when the subsys finds racy case. if there are sleeping ones, wakes them up. - rmdir() sleeps only when CGRP_WAIT_ON_RMDIR flag is set. 
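A usage sketch of the new pair for subsystem code, mirroring the kerneldoc added below (css must already be a live css):

	cgroup_exclude_rmdir(css);
	/* ... work that may pin or charge an otherwise empty cgroup ... */
	cgroup_release_and_wakeup_rmdir(css);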
Tested-by: Daisuke Nishimura Reported-by: Daisuke Nishimura Reviewed-by: Paul Menage Acked-by: Balbir Sigh Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 17 ++++++++++++++++ kernel/cgroup.c | 55 ++++++++++++++++++++++++++++++++++---------------- mm/memcontrol.c | 23 ++++++++++++++++++--- 3 files changed, 75 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 20411d2876f8..90bba9e62286 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -362,6 +362,23 @@ int cgroup_task_count(const struct cgroup *cgrp); /* Return true if cgrp is a descendant of the task's cgroup */ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); +/* + * When the subsys has to access css and may add permanent refcnt to css, + * it should take care of racy conditions with rmdir(). Following set of + * functions, is for stop/restart rmdir if necessary. + * Because these will call css_get/put, "css" should be alive css. + * + * cgroup_exclude_rmdir(); + * ...do some jobs which may access arbitrary empty cgroup + * cgroup_release_and_wakeup_rmdir(); + * + * When someone removes a cgroup while cgroup_exclude_rmdir() holds it, + * it sleeps and cgroup_release_and_wakeup_rmdir() will wake him up. + */ + +void cgroup_exclude_rmdir(struct cgroup_subsys_state *css); +void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css); + /* * Control Group subsystem type. * See Documentation/cgroups/cgroups.txt for details diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 250dac05680f..b6eadfe30e7b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -735,16 +735,28 @@ static void cgroup_d_remove_dir(struct dentry *dentry) * reference to css->refcnt. In general, this refcnt is expected to goes down * to zero, soon. * - * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex; + * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; */ DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); -static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp) +static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) { - if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) + if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) wake_up_all(&cgroup_rmdir_waitq); } +void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) +{ + css_get(css); +} + +void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) +{ + cgroup_wakeup_rmdir_waiter(css->cgroup); + css_put(css); +} + + static int rebind_subsystems(struct cgroupfs_root *root, unsigned long final_bits) { @@ -1359,7 +1371,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) * wake up rmdir() waiter. the rmdir should fail since the cgroup * is no longer empty. */ - cgroup_wakeup_rmdir_waiters(cgrp); + cgroup_wakeup_rmdir_waiter(cgrp); return 0; } @@ -2743,34 +2755,43 @@ again: } mutex_unlock(&cgroup_mutex); + /* + * In general, subsystem has no css->refcnt after pre_destroy(). But + * in racy cases, subsystem may have to get css->refcnt after + * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes + * make rmdir return -EBUSY too often. To avoid that, we use waitqueue + * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir + * and subsystem's reference count handling. Please see css_get/put + * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. 
+ */ + set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + /* * Call pre_destroy handlers of subsys. Notify subsystems * that rmdir() request comes. */ ret = cgroup_call_pre_destroy(cgrp); - if (ret) + if (ret) { + clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); return ret; + } mutex_lock(&cgroup_mutex); parent = cgrp->parent; if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { + clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); mutex_unlock(&cgroup_mutex); return -EBUSY; } - /* - * css_put/get is provided for subsys to grab refcnt to css. In typical - * case, subsystem has no reference after pre_destroy(). But, under - * hierarchy management, some *temporal* refcnt can be hold. - * To avoid returning -EBUSY to a user, waitqueue is used. If subsys - * is really busy, it should return -EBUSY at pre_destroy(). wake_up - * is called when css_put() is called and refcnt goes down to 0. - */ - set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); - if (!cgroup_clear_css_refs(cgrp)) { mutex_unlock(&cgroup_mutex); - schedule(); + /* + * Because someone may call cgroup_wakeup_rmdir_waiter() before + * prepare_to_wait(), we need to check this flag. + */ + if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) + schedule(); finish_wait(&cgroup_rmdir_waitq, &wait); clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); if (signal_pending(current)) @@ -3342,7 +3363,7 @@ void __css_put(struct cgroup_subsys_state *css) set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } - cgroup_wakeup_rmdir_waiters(cgrp); + cgroup_wakeup_rmdir_waiter(cgrp); } rcu_read_unlock(); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e717964cb5a0..fd4529d86de5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1207,6 +1207,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, ret = 0; out: unlock_page_cgroup(pc); + /* + * We charges against "to" which may not have any tasks. Then, "to" + * can be under rmdir(). But in current implementation, caller of + * this function is just force_empty() and it's garanteed that + * "to" is never removed. So, we don't check rmdir status here. + */ return ret; } @@ -1428,6 +1434,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, return; if (!ptr) return; + cgroup_exclude_rmdir(&ptr->css); pc = lookup_page_cgroup(page); mem_cgroup_lru_del_before_commit_swapcache(page); __mem_cgroup_commit_charge(ptr, pc, ctype); @@ -1457,8 +1464,12 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, } rcu_read_unlock(); } - /* add this page(page_cgroup) to the LRU we want. */ - + /* + * At swapin, we may charge account against cgroup which has no tasks. + * So, rmdir()->pre_destroy() can be called while we do this charge. + * In that case, we need to call pre_destroy() again. check it here. + */ + cgroup_release_and_wakeup_rmdir(&ptr->css); } void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) @@ -1664,7 +1675,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, if (!mem) return; - + cgroup_exclude_rmdir(&mem->css); /* at migration success, oldpage->mapping is NULL. */ if (oldpage->mapping) { target = oldpage; @@ -1704,6 +1715,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, */ if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) mem_cgroup_uncharge_page(target); + /* + * At migration, we may charge account against cgroup which has no tasks + * So, rmdir()->pre_destroy() can be called while we do this charge. 
+ * In that case, we need to call pre_destroy() again. check it here. + */ + cgroup_release_and_wakeup_rmdir(&mem->css); } /* -- cgit v1.2.3 From b62f495dad04fa94b5083aec638ff3072bccaaca Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 29 Jul 2009 15:04:09 -0700 Subject: profile: suppress warning about large allocations when profile=1 is specified When profile= is used, a large buffer is allocated early at boot. This can be larger than what the page allocator can provide so it prints a warning. However, the caller is able to handle the situation so this patch suppresses the warning. Signed-off-by: Mel Gorman Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/profile.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index 69911b5745eb..419250ebec4d 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -117,11 +117,12 @@ int __ref profile_init(void) cpumask_copy(prof_cpu_mask, cpu_possible_mask); - prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); + prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN); if (prof_buffer) return 0; - prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO); + prof_buffer = alloc_pages_exact(buffer_bytes, + GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN); if (prof_buffer) return 0; -- cgit v1.2.3 From ec30c5f3a18722f8fcf8c83146a10b03ac4d9ff1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 28 Jul 2009 19:47:23 -0400 Subject: kprobes: Use kernel_text_address() for checking probe address Use kernel_text_address() for checking probe address instead of __kernel_text_address(), because __kernel_text_address() returns true for init functions even after relaseing those functions. That will hit a BUG() in text_poke(). Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 16b5739c516a..0540948e29ab 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -694,7 +694,7 @@ int __kprobes register_kprobe(struct kprobe *p) p->addr = addr; preempt_disable(); - if (!__kernel_text_address((unsigned long) p->addr) || + if (!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr)) { preempt_enable(); return -EINVAL; -- cgit v1.2.3 From 0083fc2c50e6c5127c2802ad323adf8143ab7856 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 1 Aug 2009 10:34:56 -0700 Subject: do_sigaltstack: avoid copying 'stack_t' as a structure to user space Ulrich Drepper correctly points out that there is generally padding in the structure on 64-bit hosts, and that copying the structure from kernel to user space can leak information from the kernel stack in those padding bytes. Avoid the whole issue by just copying the three members one by one instead, which also means that the function also can avoid the need for a stack frame. This also happens to match how we copy the new structure from user space, so it all even makes sense. [ The obvious solution of adding a memset() generates horrid code, gcc does really stupid things. 
] Reported-by: Ulrich Drepper Signed-off-by: Linus Torvalds --- kernel/signal.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index ccf1ceedaebe..f268372c0cc0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2454,11 +2454,9 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s stack_t oss; int error; - if (uoss) { - oss.ss_sp = (void __user *) current->sas_ss_sp; - oss.ss_size = current->sas_ss_size; - oss.ss_flags = sas_ss_flags(sp); - } + oss.ss_sp = (void __user *) current->sas_ss_sp; + oss.ss_size = current->sas_ss_size; + oss.ss_flags = sas_ss_flags(sp); if (uss) { void __user *ss_sp; @@ -2501,13 +2499,16 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s current->sas_ss_size = ss_size; } + error = 0; if (uoss) { error = -EFAULT; - if (copy_to_user(uoss, &oss, sizeof(oss))) + if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss))) goto out; + error = __put_user(oss.ss_sp, &uoss->ss_sp) | + __put_user(oss.ss_size, &uoss->ss_size) | + __put_user(oss.ss_flags, &uoss->ss_flags); } - error = 0; out: return error; } -- cgit v1.2.3 From 0dd8486b5cfe8048e0613334659d9252ecd1b08a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 1 Aug 2009 11:18:56 -0700 Subject: do_sigaltstack: small cleanups The previous commit ("do_sigaltstack: avoid copying 'stack_t' as a structure to user space") fixed a real bug. This one just cleans up the copy from user space to that gcc can generate better code for it (and so that it looks the same as the later copy back to user space). Signed-off-by: Linus Torvalds --- kernel/signal.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index f268372c0cc0..64c5deeaca5d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2464,10 +2464,12 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s int ss_flags; error = -EFAULT; - if (!access_ok(VERIFY_READ, uss, sizeof(*uss)) - || __get_user(ss_sp, &uss->ss_sp) - || __get_user(ss_flags, &uss->ss_flags) - || __get_user(ss_size, &uss->ss_size)) + if (!access_ok(VERIFY_READ, uss, sizeof(*uss))) + goto out; + error = __get_user(ss_sp, &uss->ss_sp) | + __get_user(ss_flags, &uss->ss_flags) | + __get_user(ss_size, &uss->ss_size); + if (error) goto out; error = -EPERM; -- cgit v1.2.3 From e53c0994709166b111fbe9162d1a16ece7dfc45b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Jul 2009 14:42:10 +0200 Subject: perf_counter: Collapse inherit on read() Currently the counter value returned by read() is the value of the parent counter, to which child counters are only fed back on child exit. Thus read() can return rather erratic (and meaningless) numbers depending on the state of the child processes. Change this by always iterating the full child hierarchy on read() and sum all counters. 
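From userspace the call itself does not change; a sketch, assuming the default read_format with no extra fields, where a single read() now returns the tree-wide sum:

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static void print_count(int counter_fd)
{
	uint64_t value;

	/* one u64: the parent's count plus all inherited children */
	if (read(counter_fd, &value, sizeof(value)) == sizeof(value))
		printf("%llu\n", (unsigned long long)value);
}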
Suggested-by: Corey Ashford Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 950931041954..48471d75ae01 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1688,6 +1688,18 @@ static int perf_release(struct inode *inode, struct file *file) return 0; } +static u64 perf_counter_read_tree(struct perf_counter *counter) +{ + struct perf_counter *child; + u64 total = 0; + + total += perf_counter_read(counter); + list_for_each_entry(child, &counter->child_list, child_list) + total += perf_counter_read(child); + + return total; +} + /* * Read the performance counter - simple non blocking version for now */ @@ -1707,7 +1719,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) WARN_ON_ONCE(counter->ctx->parent_ctx); mutex_lock(&counter->child_mutex); - values[0] = perf_counter_read(counter); + values[0] = perf_counter_read_tree(counter); n = 1; if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) values[n++] = counter->total_time_enabled + -- cgit v1.2.3 From 9f498cc5be7e013d8d6e4c616980ed0ffc8680d2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 23 Jul 2009 14:46:33 +0200 Subject: perf_counter: Full task tracing In order to be able to distinguish between no samples due to inactivity and no samples due to task ended, Arjan asked for PERF_EVENT_EXIT events. This is useful to the boot delay instrumentation (bootchart) app. This patch changes the PERF_EVENT_FORK to be emitted on every clone, and adds PERF_EVENT_EXIT to be emitted on task exit, after the task's counters have been closed. This task tracing is controlled through: attr.comm || attr.mmap and through the new attr.task field. 
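A sketch of how a profiler would request the new records (field names as introduced here, other attr fields elided):

	struct perf_counter_attr attr = {
		.type	= PERF_TYPE_SOFTWARE,
		.config	= PERF_COUNT_SW_TASK_CLOCK,
		.task	= 1,	/* emit PERF_EVENT_FORK and PERF_EVENT_EXIT */
		.comm	= 1,
		.mmap	= 1,
	};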
Suggested-by: Arjan van de Ven Cc: Paul Mackerras Cc: Anton Blanchard Signed-off-by: Peter Zijlstra [ cleaned up perf_counter.h a bit ] Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 13 ++++++- kernel/fork.c | 4 +- kernel/perf_counter.c | 87 +++++++++++++++++++++++++++++--------------- 3 files changed, 71 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index bd15d7a5f5ce..e604e6ef72dd 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -181,8 +181,9 @@ struct perf_counter_attr { freq : 1, /* use freq, not period */ inherit_stat : 1, /* per task counts */ enable_on_exec : 1, /* next exec enables */ + task : 1, /* trace fork/exit */ - __reserved_1 : 51; + __reserved_1 : 50; __u32 wakeup_events; /* wakeup every n events */ __u32 __reserved_2; @@ -308,6 +309,15 @@ enum perf_event_type { */ PERF_EVENT_COMM = 3, + /* + * struct { + * struct perf_event_header header; + * u32 pid, ppid; + * u32 tid, ptid; + * }; + */ + PERF_EVENT_EXIT = 4, + /* * struct { * struct perf_event_header header; @@ -323,6 +333,7 @@ enum perf_event_type { * struct { * struct perf_event_header header; * u32 pid, ppid; + * u32 tid, ptid; * }; */ PERF_EVENT_FORK = 7, diff --git a/kernel/fork.c b/kernel/fork.c index 29b532e718f7..466531eb92cc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1269,6 +1269,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); + perf_counter_fork(p); return p; bad_fork_free_pid: @@ -1410,9 +1411,6 @@ long do_fork(unsigned long clone_flags, init_completion(&vfork); } - if (!(clone_flags & CLONE_THREAD)) - perf_counter_fork(p); - audit_finish_fork(p); tracehook_report_clone(regs, clone_flags, nr, p); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 48471d75ae01..199ed4771315 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -42,6 +42,7 @@ static int perf_overcommit __read_mostly = 1; static atomic_t nr_counters __read_mostly; static atomic_t nr_mmap_counters __read_mostly; static atomic_t nr_comm_counters __read_mostly; +static atomic_t nr_task_counters __read_mostly; /* * perf counter paranoia level: @@ -1654,6 +1655,8 @@ static void free_counter(struct perf_counter *counter) atomic_dec(&nr_mmap_counters); if (counter->attr.comm) atomic_dec(&nr_comm_counters); + if (counter->attr.task) + atomic_dec(&nr_task_counters); } if (counter->destroy) @@ -2831,10 +2834,12 @@ perf_counter_read_event(struct perf_counter *counter, } /* - * fork tracking + * task tracking -- fork/exit + * + * enabled by: attr.comm | attr.mmap | attr.task */ -struct perf_fork_event { +struct perf_task_event { struct task_struct *task; struct { @@ -2842,37 +2847,42 @@ struct perf_fork_event { u32 pid; u32 ppid; + u32 tid; + u32 ptid; } event; }; -static void perf_counter_fork_output(struct perf_counter *counter, - struct perf_fork_event *fork_event) +static void perf_counter_task_output(struct perf_counter *counter, + struct perf_task_event *task_event) { struct perf_output_handle handle; - int size = fork_event->event.header.size; - struct task_struct *task = fork_event->task; + int size = task_event->event.header.size; + struct task_struct *task = task_event->task; int ret = perf_output_begin(&handle, counter, size, 0, 0); if (ret) return; - fork_event->event.pid = perf_counter_pid(counter, task); - fork_event->event.ppid = perf_counter_pid(counter, task->real_parent); + 
task_event->event.pid = perf_counter_pid(counter, task); + task_event->event.ppid = perf_counter_pid(counter, task->real_parent); - perf_output_put(&handle, fork_event->event); + task_event->event.tid = perf_counter_tid(counter, task); + task_event->event.ptid = perf_counter_tid(counter, task->real_parent); + + perf_output_put(&handle, task_event->event); perf_output_end(&handle); } -static int perf_counter_fork_match(struct perf_counter *counter) +static int perf_counter_task_match(struct perf_counter *counter) { - if (counter->attr.comm || counter->attr.mmap) + if (counter->attr.comm || counter->attr.mmap || counter->attr.task) return 1; return 0; } -static void perf_counter_fork_ctx(struct perf_counter_context *ctx, - struct perf_fork_event *fork_event) +static void perf_counter_task_ctx(struct perf_counter_context *ctx, + struct perf_task_event *task_event) { struct perf_counter *counter; @@ -2881,19 +2891,19 @@ static void perf_counter_fork_ctx(struct perf_counter_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_fork_match(counter)) - perf_counter_fork_output(counter, fork_event); + if (perf_counter_task_match(counter)) + perf_counter_task_output(counter, task_event); } rcu_read_unlock(); } -static void perf_counter_fork_event(struct perf_fork_event *fork_event) +static void perf_counter_task_event(struct perf_task_event *task_event) { struct perf_cpu_context *cpuctx; struct perf_counter_context *ctx; cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_fork_ctx(&cpuctx->ctx, fork_event); + perf_counter_task_ctx(&cpuctx->ctx, task_event); put_cpu_var(perf_cpu_context); rcu_read_lock(); @@ -2903,32 +2913,40 @@ static void perf_counter_fork_event(struct perf_fork_event *fork_event) */ ctx = rcu_dereference(current->perf_counter_ctxp); if (ctx) - perf_counter_fork_ctx(ctx, fork_event); + perf_counter_task_ctx(ctx, task_event); rcu_read_unlock(); } -void perf_counter_fork(struct task_struct *task) +static void perf_counter_task(struct task_struct *task, int new) { - struct perf_fork_event fork_event; + struct perf_task_event task_event; if (!atomic_read(&nr_comm_counters) && - !atomic_read(&nr_mmap_counters)) + !atomic_read(&nr_mmap_counters) && + !atomic_read(&nr_task_counters)) return; - fork_event = (struct perf_fork_event){ + task_event = (struct perf_task_event){ .task = task, .event = { .header = { - .type = PERF_EVENT_FORK, + .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT, .misc = 0, - .size = sizeof(fork_event.event), + .size = sizeof(task_event.event), }, /* .pid */ /* .ppid */ + /* .tid */ + /* .ptid */ }, }; - perf_counter_fork_event(&fork_event); + perf_counter_task_event(&task_event); +} + +void perf_counter_fork(struct task_struct *task) +{ + perf_counter_task(task, 1); } /* @@ -3887,6 +3905,8 @@ done: atomic_inc(&nr_mmap_counters); if (counter->attr.comm) atomic_inc(&nr_comm_counters); + if (counter->attr.task) + atomic_inc(&nr_task_counters); } return counter; @@ -4248,8 +4268,10 @@ void perf_counter_exit_task(struct task_struct *child) struct perf_counter_context *child_ctx; unsigned long flags; - if (likely(!child->perf_counter_ctxp)) + if (likely(!child->perf_counter_ctxp)) { + perf_counter_task(child, 0); return; + } local_irq_save(flags); /* @@ -4267,15 +4289,22 @@ void perf_counter_exit_task(struct task_struct *child) * incremented the context's refcount before we do put_ctx below. 
*/ spin_lock(&child_ctx->lock); - child->perf_counter_ctxp = NULL; /* * If this context is a clone; unclone it so it can't get * swapped to another process while we're removing all * the counters from it. */ unclone_ctx(child_ctx); - spin_unlock(&child_ctx->lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&child_ctx->lock, flags); + + /* + * Report the task dead after unscheduling the counters so that we + * won't get any samples after PERF_EVENT_EXIT. We can however still + * get a few PERF_EVENT_READ events. + */ + perf_counter_task(child, 0); + + child->perf_counter_ctxp = NULL; /* * We can recurse on the same lock type through: -- cgit v1.2.3 From e414314cce7539788dd5d2c35decad11782dd858 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 23 Jul 2009 20:13:26 +0200 Subject: sched: Fix latencytop and sleep profiling vs group scheduling The latencytop and sleep accounting code assumes that any scheduler entity represents a task, this is not so. Cc: Arjan van de Ven Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9ffb2b2ceba4..652e8bdef9aa 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -611,9 +611,13 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SCHEDSTATS + struct task_struct *tsk = NULL; + + if (entity_is_task(se)) + tsk = task_of(se); + if (se->sleep_start) { u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; - struct task_struct *tsk = task_of(se); if ((s64)delta < 0) delta = 0; @@ -624,11 +628,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) se->sleep_start = 0; se->sum_sleep_runtime += delta; - account_scheduler_latency(tsk, delta >> 10, 1); + if (tsk) + account_scheduler_latency(tsk, delta >> 10, 1); } if (se->block_start) { u64 delta = rq_of(cfs_rq)->clock - se->block_start; - struct task_struct *tsk = task_of(se); if ((s64)delta < 0) delta = 0; @@ -639,17 +643,19 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) se->block_start = 0; se->sum_sleep_runtime += delta; - /* - * Blocking time is in units of nanosecs, so shift by 20 to - * get a milliseconds-range estimation of the amount of - * time that the task spent sleeping: - */ - if (unlikely(prof_on == SLEEP_PROFILING)) { - - profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), - delta >> 20); + if (tsk) { + /* + * Blocking time is in units of nanosecs, so shift by + * 20 to get a milliseconds-range estimation of the + * amount of time that the task spent sleeping: + */ + if (unlikely(prof_on == SLEEP_PROFILING)) { + profile_hits(SLEEP_PROFILING, + (void *)get_wchan(tsk), + delta >> 20); + } + account_scheduler_latency(tsk, delta >> 10, 0); } - account_scheduler_latency(tsk, delta >> 10, 0); } #endif } -- cgit v1.2.3 From 07903af152b0597d94e9b0030746b63c4664e787 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Thu, 30 Jul 2009 10:57:28 -0400 Subject: sched: Fix race in cpupri introduced by cpumask_var changes Background: Several race conditions in the scheduler have cropped up recently, which Steven and I have tracked down using ftrace. 
The most recent one turns out to be a race in how the scheduler determines a suitable migration target for RT tasks, introduced recently with commit: commit 68e74568fbe5854952355e942acca51f138096d9 Date: Tue Nov 25 02:35:13 2008 +1030 sched: convert struct cpupri_vec cpumask_var_t. The original design of cpupri allowed lockless readers to quickly determine a best-estimate target. Races between the pri_active bitmap and the vec->mask were handled in the original code because we would detect and return "0" when this occured. The design was predicated on the *effective* atomicity (*) of caching the result of cpus_and() between the cpus_allowed and the vec->mask. Commit 68e74568 changed the behavior such that vec->mask is accessed multiple times. This introduces a subtle race, the result of which means we can have a result that returns "1", but with an empty bitmap. *) yes, we know cpus_and() is not a locked operator across the entire composite array, but it is implicitly atomic on a per-word basis which is all the design required to work. Implementation: Rather than forgoing the lockless design, or reverting to a stack-based cpumask_t, we simply check for when the race has been encountered and continue processing in the event that the race is hit. This renders the removal race as if the priority bit had been atomically cleared as well, and allows the algorithm to execute correctly. Signed-off-by: Gregory Haskins CC: Rusty Russell CC: Steven Rostedt Signed-off-by: Peter Zijlstra LKML-Reference: <20090730145728.25226.92769.stgit@dev.haskins.net> Signed-off-by: Ingo Molnar --- kernel/sched_cpupri.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index e6c251790dde..d014efbf947a 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -81,8 +81,21 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) continue; - if (lowest_mask) + if (lowest_mask) { cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); + + /* + * We have to ensure that we have at least one bit + * still set in the array, since the map could have + * been concurrently emptied between the first and + * second reads of vec->mask. If we hit this + * condition, simply act as though we never hit this + * priority level and continue on. + */ + if (cpumask_any(lowest_mask) >= nr_cpu_ids) + continue; + } + return 1; } -- cgit v1.2.3 From 70d715fd0597f18528f389b5ac59102263067744 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 3 Aug 2009 11:48:19 +0900 Subject: posix-timers: Fix oops in clock_nanosleep() with CLOCK_MONOTONIC_RAW Prevent calling do_nanosleep() with clockid CLOCK_MONOTONIC_RAW, it may cause oops, such as NULL pointer dereference. 
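A userspace sketch of the call that used to oops (assumes a libc that exposes CLOCK_MONOTONIC_RAW; -lrt may be needed with glibc of this era):

#include <time.h>

int main(void)
{
	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };

	/* now fails cleanly with EOPNOTSUPP instead of dereferencing a
	 * NULL ->nsleep in the kernel */
	return clock_nanosleep(CLOCK_MONOTONIC_RAW, 0, &ts, NULL);
}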
Signed-off-by: Hiroshi Shimamoto Cc: Andrew Morton Cc: Thomas Gleixner Cc: John Stultz Cc: LKML-Reference: <4A764FF3.50607@ct.jp.nec.com> Signed-off-by: Ingo Molnar --- kernel/posix-timers.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 052ec4d195c7..d089d052c4a9 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -202,6 +202,12 @@ static int no_timer_create(struct k_itimer *new_timer) return -EOPNOTSUPP; } +static int no_nsleep(const clockid_t which_clock, int flags, + struct timespec *tsave, struct timespec __user *rmtp) +{ + return -EOPNOTSUPP; +} + /* * Return nonzero if we know a priori this clockid_t value is bogus. */ @@ -254,6 +260,7 @@ static __init int init_posix_timers(void) .clock_get = posix_get_monotonic_raw, .clock_set = do_posix_clock_nosettime, .timer_create = no_timer_create, + .nsleep = no_nsleep, }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); -- cgit v1.2.3 From 0f2541d299d233eddddee4345795e0c46264fd56 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Aug 2009 12:02:48 -0400 Subject: ring-buffer: fix check of try_to_discard result The function ring_buffer_discard_commit inverted the check on the result of try_to_discard. It should skip incrementing the entry counter if try_to_discard succeeded. But instead, it increments the entry counter when the discard succeeds, and does not increment it when the discard fails. The result of this bug is that filtering will make the stat counters incorrect. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bf27bb7a63e2..2fd1752f0c85 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1785,7 +1785,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, */ RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); - if (!rb_try_to_discard(cpu_buffer, event)) + if (rb_try_to_discard(cpu_buffer, event)) goto out; /* -- cgit v1.2.3 From 464e85eb0e63096bd52e4c3e2a6fb8357fb95828 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Aug 2009 15:26:37 -0400 Subject: ring-buffer: do not disable ring buffer on oops_in_progress The commit: commit e0fdace10e75dac67d906213b780ff1b1a4cc360 Author: David Miller Date: Fri Aug 1 01:11:22 2008 -0700 debug_locks: set oops_in_progress if we will log messages. Otherwise lock debugging messages on runqueue locks can deadlock the system due to the wakeups performed by printk(). Signed-off-by: David S. Miller Signed-off-by: Ingo Molnar Will permanently set oops_in_progress on any lockdep failure. When this triggers it will cause any read from the ring buffer to permanently disable the ring buffer (not to mention no locking of printk). This patch removes the check. It keeps the print in NMI which makes sense. This is probably OK, since the ring buffer should not cause something to set oops_in_progress anyway. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2fd1752f0c85..2606cee433da 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2486,7 +2486,7 @@ static inline int rb_ok_to_lock(void) * buffer too. A one time deal is all you get from reading * the ring buffer from an NMI.
*/ - if (likely(!in_nmi() && !oops_in_progress)) + if (likely(!in_nmi())) return 1; tracing_off_permanent(); -- cgit v1.2.3 From 1bbf20835c4e088667a090ce6523a0f70b62dc76 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Wed, 5 Aug 2009 12:05:21 -0700 Subject: rtmutex: Avoid deadlock in rt_mutex_start_proxy_lock() In the event of a lock steal or owner died, rt_mutex_start_proxy_lock() will give the rt_mutex to the waiting task, but it fails to release the wait_lock. This leads to subsequent deadlocks when other tasks try to acquire the rt_mutex. I also removed a few extra blank lines that really spaced this routine out. I must have been high on the \n when I wrote this originally... Signed-off-by: Darren Hart Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Dinakar Guniguntala Cc: John Stultz LKML-Reference: <4A79D7F1.4000405@us.ibm.com> Signed-off-by: Ingo Molnar --- kernel/rtmutex.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index fcd107a78c5a..29bd4baf9e75 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -1039,16 +1039,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { /* We got the lock for task. */ debug_rt_mutex_lock(lock); - rt_mutex_set_owner(lock, task, 0); - + spin_unlock(&lock->wait_lock); rt_mutex_deadlock_account_lock(lock, task); return 1; } ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); - if (ret && !waiter->task) { /* * Reset the return value. We might have -- cgit v1.2.3 From af6af30c0fcd77e621638e53ef8b176bca8bd3b4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 5 Aug 2009 20:41:04 +0200 Subject: ftrace: Fix perf-tracepoint OOPS Not all tracepoints are created equal, in specific the ftrace tracepoints are created with TRACE_EVENT_FORMAT() which does not generate the needed bits to tie them into perf counters. For those events, don't create the 'id' file and fail ->profile_enable when their ID is specified through other means. 
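[ Illustration, not part of the patch: a simplified sketch of the resulting ftrace_profile_enable() logic; the -EINVAL default and exact locals are assumptions. Events defined with TRACE_EVENT_FORMAT() leave ->profile_enable NULL, so binding a perf counter to them now fails up front instead of oopsing later. ]

	int ftrace_profile_enable(int event_id)
	{
		struct ftrace_event_call *event;
		int ret = -EINVAL;	/* assumed default for unknown events */

		mutex_lock(&event_mutex);
		list_for_each_entry(event, &ftrace_events, list) {
			/* only events that provide profile hooks qualify */
			if (event->id == event_id && event->profile_enable) {
				ret = event->profile_enable(event);
				break;
			}
		}
		mutex_unlock(&event_mutex);

		return ret;
	}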
Reported-by: Chris Mason Signed-off-by: Peter Zijlstra Cc: Steven Rostedt LKML-Reference: <1249497664.5890.4.camel@laptop> [ v2: fix build error in the !CONFIG_EVENT_PROFILE case ] Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 8 +++----- kernel/trace/trace_event_profile.c | 2 +- kernel/trace/trace_events.c | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 5c093ffc655b..d7cd193c2277 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -119,11 +119,9 @@ struct ftrace_event_call { void *filter; void *mod; -#ifdef CONFIG_EVENT_PROFILE - atomic_t profile_count; - int (*profile_enable)(struct ftrace_event_call *); - void (*profile_disable)(struct ftrace_event_call *); -#endif + atomic_t profile_count; + int (*profile_enable)(struct ftrace_event_call *); + void (*profile_disable)(struct ftrace_event_call *); }; #define MAX_FILTER_PRED 32 diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 5b5895afecfe..11ba5bb4ed0a 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -14,7 +14,7 @@ int ftrace_profile_enable(int event_id) mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id) { + if (event->id == event_id && event->profile_enable) { ret = event->profile_enable(event); break; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 23d2972b22d6..e75276a49cf5 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -940,7 +940,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, entry = trace_create_file("enable", 0644, call->dir, call, enable); - if (call->id) + if (call->id && call->profile_enable) entry = trace_create_file("id", 0444, call->dir, call, id); -- cgit v1.2.3 From 469535a598f28c13a2a42037e1b778f671af1d16 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 30 Jul 2009 19:19:18 +0200 Subject: ring-buffer: Fix advance of reader in rb_buffer_peek() When calling rb_buffer_peek() from ring_buffer_consume() and a padding event is returned, the function rb_advance_reader() is called twice. This may lead to missing samples or under high workloads to the warning below. This patch fixes this. If a padding event is returned by rb_buffer_peek() it will be consumed by the calling function now. Also, I simplified some code in ring_buffer_consume(). ------------[ cut here ]------------ WARNING: at /dev/shm/.source/linux/kernel/trace/ring_buffer.c:2289 rb_advance_reader+0x2e/0xc5() Hardware name: Anaheim Modules linked in: Pid: 29, comm: events/2 Tainted: G W 2.6.31-rc3-oprofile-x86_64-standard-00059-g5050dc2 #1 Call Trace: [] ? rb_advance_reader+0x2e/0xc5 [] warn_slowpath_common+0x77/0x8f [] warn_slowpath_null+0xf/0x11 [] rb_advance_reader+0x2e/0xc5 [] ring_buffer_consume+0xa0/0xd2 [] op_cpu_buffer_read_entry+0x21/0x9e [] ? __find_get_block+0x4b/0x165 [] sync_buffer+0xa5/0x401 [] ? __find_get_block+0x4b/0x165 [] ? wq_sync_buffer+0x0/0x78 [] wq_sync_buffer+0x5b/0x78 [] worker_thread+0x113/0x1ac [] ? autoremove_wake_function+0x0/0x38 [] ? worker_thread+0x0/0x1ac [] kthread+0x88/0x92 [] child_rip+0xa/0x20 [] ? kthread+0x0/0x92 [] ? 
child_rip+0x0/0x20 ---[ end trace f561c0a58fcc89bd ]--- Cc: Steven Rostedt Cc: Signed-off-by: Robert Richter Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2606cee433da..d4d3580a894a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2383,7 +2383,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) * the box. Return the padding, and we will release * the current locks, and try again. */ - rb_advance_reader(cpu_buffer); return event; case RINGBUF_TYPE_TIME_EXTEND: @@ -2519,6 +2518,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) if (dolock) spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(buffer, cpu, ts); + if (event && event->type_len == RINGBUF_TYPE_PADDING) + rb_advance_reader(cpu_buffer); if (dolock) spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); @@ -2590,12 +2591,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(buffer, cpu, ts); - if (!event) - goto out_unlock; - - rb_advance_reader(cpu_buffer); + if (event) + rb_advance_reader(cpu_buffer); - out_unlock: if (dolock) spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); -- cgit v1.2.3 From 1054598cab8674438675085fae459e960eb10799 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 Aug 2009 18:06:26 +0200 Subject: perf_counter: Fix double list iteration in per task precise stats Brice Goglin reported this crash with per task precise stats: > I finally managed to test the threaded perfcounter statistics (thanks a > lot for implementing it). I am running 2.6.31-rc5 (with the AMD > magny-cours patches but I don't think they matter here). I am trying to > measure local/remote memory accesses per thread during the well-known > stream benchmark. It's compiled with OpenMP using 16 threads on a > quad-socket quad-core barcelona machine. > > Command line is: > /mnt/scratch/bgoglin/cpunode/linux-2.6.31/tools/perf/perf record -f -s > -e r1000001e0 -e r1000002e0 -e r1000004e0 -e r1000008e0 ./stream > > It seems to work fine with a single -e on the command line > while it crashes when there are at least 2 of them. > It seems to work fine without -s as well. A silly copy-paste resulted in a messed up iteration which would cause the OOPS. Reported-by: Brice Goglin Signed-off-by: Peter Zijlstra Tested-by: Brice Goglin LKML-Reference: <1249574786.32113.550.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 199ed4771315..673c1aaf7332 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1104,7 +1104,7 @@ static void perf_counter_sync_stat(struct perf_counter_context *ctx, __perf_counter_sync_stat(counter, next_counter); counter = list_next_entry(counter, event_entry); - next_counter = list_next_entry(counter, event_entry); + next_counter = list_next_entry(next_counter, event_entry); } } -- cgit v1.2.3 From 9795447f71324d8f14c19ed68b43c883135c3f59 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 7 Aug 2009 16:37:10 +0800 Subject: lockdep: Fix file mode of lock_stat /proc/lock_stat is writable. 
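[ Illustration, not part of the patch: why the file needs write permission. Writing '0' to /proc/lock_stat clears the accumulated statistics, e.g. from userspace: ]

	#include <fcntl.h>
	#include <unistd.h>

	static int clear_lock_stats(void)
	{
		int fd = open("/proc/lock_stat", O_WRONLY);

		if (fd < 0)
			return -1;
		if (write(fd, "0", 1) != 1) {	/* '0' resets the counters */
			close(fd);
			return -1;
		}
		close(fd);
		return 0;
	}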
Signed-off-by: Li Zefan Cc: Peter Zijlstra LKML-Reference: <4A7BE7B6.10904@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/lockdep_proc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index d7135aa2d2c4..e94caa666dba 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -758,7 +758,8 @@ static int __init lockdep_proc_init(void) &proc_lockdep_stats_operations); #ifdef CONFIG_LOCK_STAT - proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations); + proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL, + &proc_lock_stat_operations); #endif return 0; -- cgit v1.2.3 From bd3f02212d6a457267e0c9c02c426151c436d9d4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 7 Aug 2009 12:49:29 +0200 Subject: ring-buffer: Fix memleak in ring_buffer_free() I noticed oprofile memleaked in linux-2.6 current tree, and tracked this ring-buffer leak. Signed-off-by: Eric Dumazet LKML-Reference: <4A7C06B9.2090302@gmail.com> Cc: stable@kernel.org Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d4d3580a894a..a330513d96ce 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -735,6 +735,7 @@ ring_buffer_free(struct ring_buffer *buffer) put_online_cpus(); + kfree(buffer->buffers); free_cpumask_var(buffer->cpumask); kfree(buffer); -- cgit v1.2.3 From 69dd647f969c28d18de77e2153f30d05a1874571 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 6 Aug 2009 15:07:29 -0700 Subject: generic-ipi: fix hotplug_cfd() Use CONFIG_HOTPLUG_CPU, not CONFIG_CPU_HOTPLUG When hot-unplugging a cpu, it will leak memory allocated at cpu hotplug, but only if CPUMASK_OFFSTACK=y, which defaults to n. The bug was introduced by 8969a5ede0f9e17da4b943712429aef2c9bcd82b ("generic-ipi: remove kmalloc()"). Signed-off-by: Xiao Guangrong Cc: Ingo Molnar Cc: Jens Axboe Cc: Nick Piggin Cc: Peter Zijlstra Cc: Rusty Russell Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index ad63d8501207..94188b8ecc33 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -57,7 +57,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_BAD; break; -#ifdef CONFIG_CPU_HOTPLUG +#ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: -- cgit v1.2.3 From 9c8a8228d0827e0d91d28527209988f672f97d28 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 6 Aug 2009 15:09:28 -0700 Subject: execve: must clear current->clear_child_tid While looking at Jens Rosenboom's bug report (http://lkml.org/lkml/2009/7/27/35) about a strange sys_futex call done from a dying "ps" program, we found the following problem. The clone() syscall has special support for the TID of created threads. This support includes two features. One (CLONE_CHILD_SETTID) is to set an integer into user memory with the TID value. One (CLONE_CHILD_CLEARTID) is to clear this same integer once the created thread dies. The integer location is a user-provided pointer, provided at clone() time. The kernel keeps this pointer value in current->clear_child_tid. At execve() time, we should make sure the kernel doesn't keep this user-provided pointer, as full user memory is replaced by a new one.
As glibc fork() actually uses clone() syscall with CLONE_CHILD_SETTID and CLONE_CHILD_CLEARTID set, chances are high that we might corrupt user memory in forked processes. Following sequence could happen: 1) bash (or any program) starts a new process, by a fork() call that glibc maps to a clone( ... CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID ...) syscall 2) When new process starts, its current->clear_child_tid is set to a location that has a meaning only in bash (or initial program) context (&THREAD_SELF->tid) 3) This new process does the execve() syscall to start a new program. current->clear_child_tid is left unchanged (a non NULL value) 4) If this new program creates some threads, and initial thread exits, kernel will attempt to clear the integer pointed by current->clear_child_tid from mm_release() : if (tsk->clear_child_tid && !(tsk->flags & PF_SIGNALED) && atomic_read(&mm->mm_users) > 1) { u32 __user * tidptr = tsk->clear_child_tid; tsk->clear_child_tid = NULL; /* * We don't check the error code - if userspace has * not set up a proper pointer then tough luck. */ << here >> put_user(0, tidptr); sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0); } 5) OR : if new program is not multi-threaded, but spied by /proc/pid users (ps command for example), mm_users > 1, and the exiting program could corrupt 4 bytes in a persistent memory area (shm or memory mapped file) If current->clear_child_tid points to a writeable portion of memory of the new program, kernel happily and silently corrupts 4 bytes of memory, with unexpected effects. Fix is straightforward and should not break any sane program. Reported-by: Jens Rosenboom Acked-by: Linus Torvalds Signed-off-by: Eric Dumazet Signed-off-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Sonny Rao Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Ulrich Drepper Cc: Oleg Nesterov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 466531eb92cc..021e1138556e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -568,18 +568,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) * the value intact in a core dump, and to save the unnecessary * trouble otherwise. Userland only wants this done for a sys_exit. */ - if (tsk->clear_child_tid - && !(tsk->flags & PF_SIGNALED) - && atomic_read(&mm->mm_users) > 1) { - u32 __user * tidptr = tsk->clear_child_tid; + if (tsk->clear_child_tid) { + if (!(tsk->flags & PF_SIGNALED) && + atomic_read(&mm->mm_users) > 1) { + /* + * We don't check the error code - if userspace has + * not set up a proper pointer then tough luck. + */ + put_user(0, tsk->clear_child_tid); + sys_futex(tsk->clear_child_tid, FUTEX_WAKE, + 1, NULL, NULL, 0); + } tsk->clear_child_tid = NULL; - - /* - * We don't check the error code - if userspace has - * not set up a proper pointer then tough luck. - */ - put_user(0, tidptr); - sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0); } } -- cgit v1.2.3 From ad7d6c7a0654a4bbda3e109f56af713267e96274 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 4 Aug 2009 09:01:33 -0700 Subject: x86/irq: Fix move_irq_desc() for nodes without ram Don't move it if target node is -1. 
Signed-off-by: Yinghai Lu LKML-Reference: <4A785B5D.4070702@kernel.org> Signed-off-by: Ingo Molnar --- kernel/irq/numa_migrate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 2f69bee57bf2..3fd30197da2e 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -107,8 +107,8 @@ out_unlock: struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) { - /* those all static, do move them */ - if (desc->irq < NR_IRQS_LEGACY) + /* those static or target node is -1, do not move them */ + if (desc->irq < NR_IRQS_LEGACY || node == -1) return desc; if (desc->node != node) -- cgit v1.2.3 From 96b2de313b1e0e02aea80ee47df6a2b5cbdf8e13 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sat, 8 Aug 2009 10:49:09 -0500 Subject: tracing/filters: Don't use pred on alloc failure Dan Carpenter sent me a fix to prevent pred from being used if it couldn't be allocated. I noticed the same problem also existed for the create_pred() case and added a fix for that. Reported-by: Dan Carpenter Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Li Zefan LKML-Reference: <1249746549.6453.29.camel@tropicana> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 936c621bbf46..1557148be34b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1029,6 +1029,8 @@ static int replace_preds(struct event_subsystem *system, if (elt->op == OP_AND || elt->op == OP_OR) { pred = create_logical_pred(elt->op); + if (!pred) + return -ENOMEM; if (call) { err = filter_add_pred(ps, call, pred); filter_free_pred(pred); @@ -1048,6 +1050,8 @@ static int replace_preds(struct event_subsystem *system, } pred = create_pred(elt->op, operand1, operand2); + if (!pred) + return -ENOMEM; if (call) { err = filter_add_pred(ps, call, pred); filter_free_pred(pred); -- cgit v1.2.3 From 26528e773ecc74fb1b61b7275f86f761cbb340ec Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sat, 8 Aug 2009 10:49:53 -0500 Subject: tracing/filters: Always free pred on filter_add_subsystem_pred() failure If filter_add_subsystem_pred() fails due to ENOSPC or ENOMEM, the pred doesn't get freed, while as a side effect it does for other errors. Make it so the caller always frees the pred for any error. 
Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Li Zefan LKML-Reference: <1249746593.6453.32.camel@tropicana> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 1557148be34b..f32dc9d1ea7b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -624,9 +624,6 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, return -ENOSPC; } - filter->preds[filter->n_preds] = pred; - filter->n_preds++; - list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) @@ -643,6 +640,9 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, } replace_filter_string(call->filter, filter_string); } + + filter->preds[filter->n_preds] = pred; + filter->n_preds++; out: return err; } @@ -1034,9 +1034,12 @@ static int replace_preds(struct event_subsystem *system, if (call) { err = filter_add_pred(ps, call, pred); filter_free_pred(pred); - } else + } else { err = filter_add_subsystem_pred(ps, system, pred, filter_string); + if (err) + filter_free_pred(pred); + } if (err) return err; @@ -1055,9 +1058,12 @@ static int replace_preds(struct event_subsystem *system, if (call) { err = filter_add_pred(ps, call, pred); filter_free_pred(pred); - } else + } else { err = filter_add_subsystem_pred(ps, system, pred, filter_string); + if (err) + filter_free_pred(pred); + } if (err) return err; -- cgit v1.2.3 From 17d42c1c497aa54952b9e58c1502a46f0df40315 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 6 Aug 2009 16:03:30 -0700 Subject: posix_cpu_timers_exit_group(): Do not use thread_group_cputimer() When the process exits we don't have to run new cputimer nor use running one (as it not accounts when tsk->exit_state != 0) to get process CPU times. As there is only one thread we can just use CPU times fields from task and signal structs. Signed-off-by: Stanislaw Gruszka Cc: Peter Zijlstra Cc: Roland McGrath Cc: Vitaly Mayatskikh Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/posix-cpu-timers.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index bece7c0b67b2..e33a21cb9407 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -521,11 +521,12 @@ void posix_cpu_timers_exit(struct task_struct *tsk) } void posix_cpu_timers_exit_group(struct task_struct *tsk) { - struct task_cputime cputime; + struct signal_struct *const sig = tsk->signal; - thread_group_cputimer(tsk, &cputime); cleanup_timers(tsk->signal->cpu_timers, - cputime.utime, cputime.stime, cputime.sum_exec_runtime); + cputime_add(tsk->utime, sig->utime), + cputime_add(tsk->stime, sig->stime), + tsk->se.sum_exec_runtime + sig->sum_sched_runtime); } static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) -- cgit v1.2.3 From 3a6593050fbd8bbcaed3a44d01c31d907315c86c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jul 2009 17:34:57 +0200 Subject: perf_counter, ftrace: Fix perf_counter integration Adds possible second part to the assign argument of TP_EVENT(). 
TP_perf_assign( __perf_count(foo); __perf_addr(bar); ) Which, when specified make the swcounter increment with @foo instead of the usual 1, and report @bar for PERF_SAMPLE_ADDR (data address associated with the event) when this triggers a counter overflow. Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Cc: Frederic Weisbecker Cc: Jason Baron Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- include/trace/ftrace.h | 110 ++++++++++++++++++++++++++++++++++++++----------- kernel/perf_counter.c | 6 +-- 2 files changed, 88 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 1867553c61e5..fec71f8dbc48 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -144,6 +144,9 @@ #undef TP_fast_assign #define TP_fast_assign(args...) args +#undef TP_perf_assign +#define TP_perf_assign(args...) + #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, func, print) \ static int \ @@ -345,6 +348,88 @@ static inline int ftrace_get_offsets_##call( \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +#ifdef CONFIG_EVENT_PROFILE + +/* + * Generate the functions needed for tracepoint perf_counter support. + * + * static void ftrace_profile_(proto) + * { + * extern void perf_tpcounter_event(int, u64, u64); + * u64 __addr = 0, __count = 1; + * + * <-- here we expand the TP_perf_assign() macro + * + * perf_tpcounter_event(event_.id, __addr, __count); + * } + * + * static int ftrace_profile_enable_(struct ftrace_event_call *event_call) + * { + * int ret = 0; + * + * if (!atomic_inc_return(&event_call->profile_count)) + * ret = register_trace_(ftrace_profile_); + * + * return ret; + * } + * + * static void ftrace_profile_disable_(struct ftrace_event_call *event_call) + * { + * if (atomic_add_negative(-1, &event->call->profile_count)) + * unregister_trace_(ftrace_profile_); + * } + * + */ + +#undef TP_fast_assign +#define TP_fast_assign(args...) + +#undef TP_perf_assign +#define TP_perf_assign(args...) args + +#undef __perf_addr +#define __perf_addr(a) __addr = (a) + +#undef __perf_count +#define __perf_count(c) __count = (c) + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ + \ +static void ftrace_profile_##call(proto) \ +{ \ + extern void perf_tpcounter_event(int, u64, u64); \ + u64 __addr = 0, __count = 1; \ + { assign; } \ + perf_tpcounter_event(event_##call.id, __addr, __count); \ +} \ + \ +static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \ +{ \ + int ret = 0; \ + \ + if (!atomic_inc_return(&event_call->profile_count)) \ + ret = register_trace_##call(ftrace_profile_##call); \ + \ + return ret; \ +} \ + \ +static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ +{ \ + if (atomic_add_negative(-1, &event_call->profile_count)) \ + unregister_trace_##call(ftrace_profile_##call); \ +} + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +#undef TP_fast_assign +#define TP_fast_assign(args...) args + +#undef TP_perf_assign +#define TP_perf_assign(args...) + +#endif + /* * Stage 4 of the trace events. * @@ -447,28 +532,6 @@ static inline int ftrace_get_offsets_##call( \ #define TP_FMT(fmt, args...) 
fmt "\n", ##args #ifdef CONFIG_EVENT_PROFILE -#define _TRACE_PROFILE(call, proto, args) \ -static void ftrace_profile_##call(proto) \ -{ \ - extern void perf_tpcounter_event(int); \ - perf_tpcounter_event(event_##call.id); \ -} \ - \ -static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \ -{ \ - int ret = 0; \ - \ - if (!atomic_inc_return(&event_call->profile_count)) \ - ret = register_trace_##call(ftrace_profile_##call); \ - \ - return ret; \ -} \ - \ -static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ -{ \ - if (atomic_add_negative(-1, &event_call->profile_count)) \ - unregister_trace_##call(ftrace_profile_##call); \ -} #define _TRACE_PROFILE_INIT(call) \ .profile_count = ATOMIC_INIT(-1), \ @@ -476,7 +539,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ .profile_disable = ftrace_profile_disable_##call, #else -#define _TRACE_PROFILE(call, proto, args) #define _TRACE_PROFILE_INIT(call) #endif @@ -502,7 +564,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ -_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ \ static struct ftrace_event_call event_##call; \ \ @@ -586,6 +647,5 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -#undef _TRACE_PROFILE #undef _TRACE_PROFILE_INIT diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 673c1aaf7332..52eb4b68d34f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3703,17 +3703,17 @@ static const struct pmu perf_ops_task_clock = { }; #ifdef CONFIG_EVENT_PROFILE -void perf_tpcounter_event(int event_id) +void perf_tpcounter_event(int event_id, u64 addr, u64 count) { struct perf_sample_data data = { .regs = get_irq_regs(), - .addr = 0, + .addr = addr, }; if (!data.regs) data.regs = task_pt_regs(current); - do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data); + do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data); } EXPORT_SYMBOL_GPL(perf_tpcounter_event); -- cgit v1.2.3 From f413cdb80ce00ec1a4d0ab949b5d96c81cae7f75 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Aug 2009 01:25:54 +0200 Subject: perf_counter: Fix/complete ftrace event records sampling This patch implements the kernel side support for ftrace event record sampling. A new counter sampling attribute is added: PERF_SAMPLE_TP_RECORD which requests ftrace events record sampling. In this case if a PERF_TYPE_TRACEPOINT counter is active and a tracepoint fires, we emit the tracepoint binary record to the perfcounter event buffer, as a sample. Result, after setting PERF_SAMPLE_TP_RECORD attribute from perf record: perf record -f -F 1 -a -e workqueue:workqueue_execution perf report -D 0x21e18 [0x48]: event: 9 . . ... raw event: size 72 bytes . 0000: 09 00 00 00 01 00 48 00 d0 c7 00 81 ff ff ff ff ......H........ . 0010: 0a 00 00 00 0a 00 00 00 21 00 00 00 00 00 00 00 ........!...... . 0020: 2b 00 01 02 0a 00 00 00 0a 00 00 00 65 76 65 6e +...........eve . 0030: 74 73 2f 31 00 00 00 00 00 00 00 00 0a 00 00 00 ts/1........... . 0040: e0 b1 31 81 ff ff ff ff ....... . 0x21e18 [0x48]: PERF_EVENT_SAMPLE (IP, 1): 10: 0xffffffff8100c7d0 period: 33 The raw ftrace binary record starts at offset 0020. 
Translation: struct trace_entry { type = 0x2b = 43; flags = 1; preempt_count = 2; pid = 0xa = 10; tgid = 0xa = 10; } thread_comm = "events/1" thread_pid = 0xa = 10; func = 0xffffffff8131b1e0 = flush_to_ldisc() What will come next? - Userspace support ('perf trace'), 'flight data recorder' mode for perf trace, etc. - The unconditional copy from the profiling callback brings some costs however if someone wants no such sampling to occur, and needs to be fixed in the future. For that we need to have an instant access to the perf counter attribute. This is a matter of a flag to add in the struct ftrace_event. - Take care of the events recursivity! Don't ever try to record a lock event for example, it seems some locking is used in the profiling fast path and lead to a tracing recursivity. That will be fixed using raw spinlock or recursivity protection. - [...] - Profit! :-) Signed-off-by: Frederic Weisbecker Cc: Li Zefan Cc: Tom Zanussi Cc: Arnaldo Carvalho de Melo Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Steven Rostedt Cc: Paul Mackerras Cc: Pekka Enberg Cc: Gabriel Munteanu Cc: Lai Jiangshan Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 4 +- include/linux/perf_counter.h | 9 ++- include/trace/ftrace.h | 130 ++++++++++++++++++++++++++++++++----------- kernel/perf_counter.c | 18 +++++- kernel/trace/trace.c | 1 + kernel/trace/trace.h | 4 -- tools/perf/builtin-record.c | 1 + 7 files changed, 126 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index d7cd193c2277..a81170de7f6b 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -89,7 +89,9 @@ enum print_line_t { TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ }; - +void tracing_generic_entry_update(struct trace_entry *entry, + unsigned long flags, + int pc); struct ring_buffer_event * trace_current_buffer_lock_reserve(int type, unsigned long len, unsigned long flags, int pc); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index e604e6ef72dd..a67dd5c5b6d3 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -121,8 +121,9 @@ enum perf_counter_sample_format { PERF_SAMPLE_CPU = 1U << 7, PERF_SAMPLE_PERIOD = 1U << 8, PERF_SAMPLE_STREAM_ID = 1U << 9, + PERF_SAMPLE_TP_RECORD = 1U << 10, - PERF_SAMPLE_MAX = 1U << 10, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ }; /* @@ -413,6 +414,11 @@ struct perf_callchain_entry { __u64 ip[PERF_MAX_STACK_DEPTH]; }; +struct perf_tracepoint_record { + int size; + char *record; +}; + struct task_struct; /** @@ -681,6 +687,7 @@ struct perf_sample_data { struct pt_regs *regs; u64 addr; u64 period; + void *private; }; extern int perf_counter_overflow(struct perf_counter *counter, int nmi, diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index fec71f8dbc48..7fb16d90e7b1 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -353,15 +353,7 @@ static inline int ftrace_get_offsets_##call( \ /* * Generate the functions needed for tracepoint perf_counter support. 
* - * static void ftrace_profile_(proto) - * { - * extern void perf_tpcounter_event(int, u64, u64); - * u64 __addr = 0, __count = 1; - * - * <-- here we expand the TP_perf_assign() macro - * - * perf_tpcounter_event(event_.id, __addr, __count); - * } + * NOTE: The insertion profile callback (ftrace_profile_) is defined later * * static int ftrace_profile_enable_(struct ftrace_event_call *event_call) * { @@ -381,28 +373,10 @@ static inline int ftrace_get_offsets_##call( \ * */ -#undef TP_fast_assign -#define TP_fast_assign(args...) - -#undef TP_perf_assign -#define TP_perf_assign(args...) args - -#undef __perf_addr -#define __perf_addr(a) __addr = (a) - -#undef __perf_count -#define __perf_count(c) __count = (c) - #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ \ -static void ftrace_profile_##call(proto) \ -{ \ - extern void perf_tpcounter_event(int, u64, u64); \ - u64 __addr = 0, __count = 1; \ - { assign; } \ - perf_tpcounter_event(event_##call.id, __addr, __count); \ -} \ +static void ftrace_profile_##call(proto); \ \ static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \ { \ @@ -422,12 +396,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -#undef TP_fast_assign -#define TP_fast_assign(args...) args - -#undef TP_perf_assign -#define TP_perf_assign(args...) - #endif /* @@ -647,5 +615,99 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +/* + * Define the insertion callback to profile events + * + * The job is very similar to ftrace_raw_event_ except that we don't + * insert in the ring buffer but in a perf counter. + * + * static void ftrace_profile_(proto) + * { + * struct ftrace_data_offsets_ __maybe_unused __data_offsets; + * struct ftrace_event_call *event_call = &event_; + * extern void perf_tpcounter_event(int, u64, u64, void *, int); + * struct ftrace_raw_##call *entry; + * u64 __addr = 0, __count = 1; + * unsigned long irq_flags; + * int __entry_size; + * int __data_size; + * int pc; + * + * local_save_flags(irq_flags); + * pc = preempt_count(); + * + * __data_size = ftrace_get_offsets_(&__data_offsets, args); + * __entry_size = __data_size + sizeof(*entry); + * + * do { + * char raw_data[__entry_size]; <- allocate our sample in the stack + * struct trace_entry *ent; + * + * entry = (struct ftrace_raw_ *)raw_data; + * ent = &entry->ent; + * tracing_generic_entry_update(ent, irq_flags, pc); + * ent->type = event_call->id; + * + * <- do some jobs with dynamic arrays + * + * <- affect our values + * + * perf_tpcounter_event(event_call->id, __addr, __count, entry, + * __entry_size); <- submit them to perf counter + * } while (0); + * + * } + */ + +#ifdef CONFIG_EVENT_PROFILE + +#undef __perf_addr +#define __perf_addr(a) __addr = (a) + +#undef __perf_count +#define __perf_count(c) __count = (c) + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ +static void ftrace_profile_##call(proto) \ +{ \ + struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ + struct ftrace_event_call *event_call = &event_##call; \ + extern void perf_tpcounter_event(int, u64, u64, void *, int); \ + struct ftrace_raw_##call *entry; \ + u64 __addr = 0, __count = 1; \ + unsigned long irq_flags; \ + int __entry_size; \ + int __data_size; \ + int pc; \ + \ + local_save_flags(irq_flags); \ + pc = preempt_count(); \ + \ + __data_size = 
ftrace_get_offsets_##call(&__data_offsets, args); \ + __entry_size = ALIGN(__data_size + sizeof(*entry), sizeof(u64));\ + \ + do { \ + char raw_data[__entry_size]; \ + struct trace_entry *ent; \ + \ + entry = (struct ftrace_raw_##call *)raw_data; \ + ent = &entry->ent; \ + tracing_generic_entry_update(ent, irq_flags, pc); \ + ent->type = event_call->id; \ + \ + tstruct \ + \ + { assign; } \ + \ + perf_tpcounter_event(event_call->id, __addr, __count, entry,\ + __entry_size); \ + } while (0); \ + \ +} + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +#endif /* CONFIG_EVENT_PROFILE */ + #undef _TRACE_PROFILE_INIT diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 52eb4b68d34f..868102172aa4 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,6 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; + struct perf_tracepoint_record *tp; int callchain_size = 0; u64 time; struct { @@ -2714,6 +2715,11 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, header.size += sizeof(u64); } + if (sample_type & PERF_SAMPLE_TP_RECORD) { + tp = data->private; + header.size += tp->size; + } + ret = perf_output_begin(&handle, counter, header.size, nmi, 1); if (ret) return; @@ -2777,6 +2783,9 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } + if (sample_type & PERF_SAMPLE_TP_RECORD) + perf_output_copy(&handle, tp->record, tp->size); + perf_output_end(&handle); } @@ -3703,11 +3712,18 @@ static const struct pmu perf_ops_task_clock = { }; #ifdef CONFIG_EVENT_PROFILE -void perf_tpcounter_event(int event_id, u64 addr, u64 count) +void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, + int entry_size) { + struct perf_tracepoint_record tp = { + .size = entry_size, + .record = record, + }; + struct perf_sample_data data = { .regs = get_irq_regs(), .addr = addr, + .private = &tp, }; if (!data.regs) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8930e39b9d8c..c22b40f8f576 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -848,6 +848,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | (need_resched() ? 
TRACE_FLAG_NEED_RESCHED : 0); } +EXPORT_SYMBOL_GPL(tracing_generic_entry_update); struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, int type, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3548ae5cc780..8b9f4f6e9559 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -438,10 +438,6 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts); -void tracing_generic_entry_update(struct trace_entry *entry, - unsigned long flags, - int pc); - void default_wait_pipe(struct trace_iterator *iter); void poll_wait_pipe(struct trace_iterator *iter); diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 6da09928130f..90c98082af10 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -412,6 +412,7 @@ static void create_counter(int counter, int cpu, pid_t pid) if (call_graph) attr->sample_type |= PERF_SAMPLE_CALLCHAIN; + attr->mmap = track; attr->comm = track; attr->inherit = (cpu < 0) && inherit; -- cgit v1.2.3 From 7b4b6658e152ed4568cfff48175d93645df081d1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Jul 2009 09:29:32 +0200 Subject: perf_counter: Fix software counters for fast moving event sources Reimplement the software counters to deal with fast moving event sources (such as tracepoints). This means being able to generate multiple overflows from a single 'event' as well as support throttling. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 164 +++++++++++++++++++++++++++++--------------------- 1 file changed, 94 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 868102172aa4..615440ab9295 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3344,87 +3344,81 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi, * Generic software counter infrastructure */ -static void perf_swcounter_update(struct perf_counter *counter) +/* + * We directly increment counter->count and keep a second value in + * counter->hw.period_left to count intervals. This period counter + * is kept in the range [-sample_period, 0] so that we can use the + * sign as trigger. 
+ */ + +static u64 perf_swcounter_set_period(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; - u64 prev, now; - s64 delta; + u64 period = hwc->last_period; + u64 nr, offset; + s64 old, val; + + hwc->last_period = hwc->sample_period; again: - prev = atomic64_read(&hwc->prev_count); - now = atomic64_read(&hwc->count); - if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev) - goto again; + old = val = atomic64_read(&hwc->period_left); + if (val < 0) + return 0; - delta = now - prev; + nr = div64_u64(period + val, period); + offset = nr * period; + val -= offset; + if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) + goto again; - atomic64_add(delta, &counter->count); - atomic64_sub(delta, &hwc->period_left); + return nr; } -static void perf_swcounter_set_period(struct perf_counter *counter) +static void perf_swcounter_overflow(struct perf_counter *counter, + int nmi, struct perf_sample_data *data) { struct hw_perf_counter *hwc = &counter->hw; - s64 left = atomic64_read(&hwc->period_left); - s64 period = hwc->sample_period; + u64 overflow; - if (unlikely(left <= -period)) { - left = period; - atomic64_set(&hwc->period_left, left); - hwc->last_period = period; - } + data->period = counter->hw.last_period; + overflow = perf_swcounter_set_period(counter); - if (unlikely(left <= 0)) { - left += period; - atomic64_add(period, &hwc->period_left); - hwc->last_period = period; - } + if (hwc->interrupts == MAX_INTERRUPTS) + return; - atomic64_set(&hwc->prev_count, -left); - atomic64_set(&hwc->count, -left); + for (; overflow; overflow--) { + if (perf_counter_overflow(counter, nmi, data)) { + /* + * We inhibit the overflow from happening when + * hwc->interrupts == MAX_INTERRUPTS. + */ + break; + } + } } -static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) +static void perf_swcounter_unthrottle(struct perf_counter *counter) { - enum hrtimer_restart ret = HRTIMER_RESTART; - struct perf_sample_data data; - struct perf_counter *counter; - u64 period; - - counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); - counter->pmu->read(counter); - - data.addr = 0; - data.regs = get_irq_regs(); /* - * In case we exclude kernel IPs or are somehow not in interrupt - * context, provide the next best thing, the user IP. + * Nothing to do, we already reset hwc->interrupts. 
*/ - if ((counter->attr.exclude_kernel || !data.regs) && - !counter->attr.exclude_user) - data.regs = task_pt_regs(current); +} - if (data.regs) { - if (perf_counter_overflow(counter, 0, &data)) - ret = HRTIMER_NORESTART; - } +static void perf_swcounter_add(struct perf_counter *counter, u64 nr, + int nmi, struct perf_sample_data *data) +{ + struct hw_perf_counter *hwc = &counter->hw; - period = max_t(u64, 10000, counter->hw.sample_period); - hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + atomic64_add(nr, &counter->count); - return ret; -} + if (!hwc->sample_period) + return; -static void perf_swcounter_overflow(struct perf_counter *counter, - int nmi, struct perf_sample_data *data) -{ - data->period = counter->hw.last_period; + if (!data->regs) + return; - perf_swcounter_update(counter); - perf_swcounter_set_period(counter); - if (perf_counter_overflow(counter, nmi, data)) - /* soft-disable the counter */ - ; + if (!atomic64_add_negative(nr, &hwc->period_left)) + perf_swcounter_overflow(counter, nmi, data); } static int perf_swcounter_is_counting(struct perf_counter *counter) @@ -3488,15 +3482,6 @@ static int perf_swcounter_match(struct perf_counter *counter, return 1; } -static void perf_swcounter_add(struct perf_counter *counter, u64 nr, - int nmi, struct perf_sample_data *data) -{ - int neg = atomic64_add_negative(nr, &counter->hw.count); - - if (counter->hw.sample_period && !neg && data->regs) - perf_swcounter_overflow(counter, nmi, data); -} - static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, enum perf_type_id type, u32 event, u64 nr, int nmi, @@ -3575,26 +3560,65 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi, static void perf_swcounter_read(struct perf_counter *counter) { - perf_swcounter_update(counter); } static int perf_swcounter_enable(struct perf_counter *counter) { - perf_swcounter_set_period(counter); + struct hw_perf_counter *hwc = &counter->hw; + + if (hwc->sample_period) { + hwc->last_period = hwc->sample_period; + perf_swcounter_set_period(counter); + } return 0; } static void perf_swcounter_disable(struct perf_counter *counter) { - perf_swcounter_update(counter); } static const struct pmu perf_ops_generic = { .enable = perf_swcounter_enable, .disable = perf_swcounter_disable, .read = perf_swcounter_read, + .unthrottle = perf_swcounter_unthrottle, }; +/* + * hrtimer based swcounter callback + */ + +static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) +{ + enum hrtimer_restart ret = HRTIMER_RESTART; + struct perf_sample_data data; + struct perf_counter *counter; + u64 period; + + counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); + counter->pmu->read(counter); + + data.addr = 0; + data.regs = get_irq_regs(); + /* + * In case we exclude kernel IPs or are somehow not in interrupt + * context, provide the next best thing, the user IP. 
+ */ + if ((counter->attr.exclude_kernel || !data.regs) && + !counter->attr.exclude_user) + data.regs = task_pt_regs(current); + + if (data.regs) { + if (perf_counter_overflow(counter, 0, &data)) + ret = HRTIMER_NORESTART; + } + + period = max_t(u64, 10000, counter->hw.sample_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + + return ret; +} + /* * Software counter: cpu wall time clock */ -- cgit v1.2.3 From 10b8e3066066708f304e0fc5cfe658e05abf943d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Aug 2009 04:26:35 +0200 Subject: perf_counter: Work around gcc warning by initializing tracepoint record unconditionally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Despite that the tracepoint record is always present when the PERF_SAMPLE_TP_RECORD flag is set, gcc raises a warning, thinking it might not be initialized: kernel/perf_counter.c: In function ‘perf_counter_output’: kernel/perf_counter.c:2650: warning: ‘tp’ may be used uninitialized in this function Then, initialize it to NULL and always check if it's not NULL before dereference it. Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249698400-5441-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 615440ab9295..117622cb73a3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,7 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; - struct perf_tracepoint_record *tp; + struct perf_tracepoint_record *tp = NULL; int callchain_size = 0; u64 time; struct { @@ -2717,7 +2717,8 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_TP_RECORD) { tp = data->private; - header.size += tp->size; + if (tp) + header.size += tp->size; } ret = perf_output_begin(&handle, counter, header.size, nmi, 1); @@ -2783,7 +2784,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } - if (sample_type & PERF_SAMPLE_TP_RECORD) + if ((sample_type & PERF_SAMPLE_TP_RECORD) && tp) perf_output_copy(&handle, tp->record, tp->size); perf_output_end(&handle); -- cgit v1.2.3 From 3a43ce68ae1758fa6a839386025ef45acb6baa22 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Aug 2009 04:26:37 +0200 Subject: perf_counter: Fix tracepoint sampling to be part of generic sampling Based on Peter's comments, make tracepoint sampling generic just like all the other sampling bits are. This is a rename with no code changes: - PERF_SAMPLE_TP_RECORD to PERF_SAMPLE_RAW - struct perf_tracepoint_record to perf_raw_record We want the system in place that transport tracepoints raw samples events into the perf ring buffer to be generalized and usable by any type of counter. 
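[ Illustration, not part of the patch: with the renamed types, any in-kernel event source can attach an opaque payload to a sample roughly like this; the tracepoint path below is just the first user. ]

	struct perf_raw_record raw = {
		.size = entry_size,	/* size of the opaque payload */
		.data = record,		/* the payload itself */
	};
	struct perf_sample_data data = {
		.regs = get_irq_regs(),
		.addr = addr,
		.raw  = &raw,		/* emitted when PERF_SAMPLE_RAW is set */
	};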
Reported-by; Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249698400-5441-4-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 10 +++++----- kernel/perf_counter.c | 20 ++++++++++---------- 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a67dd5c5b6d3..2aabe43c1d04 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -121,7 +121,7 @@ enum perf_counter_sample_format { PERF_SAMPLE_CPU = 1U << 7, PERF_SAMPLE_PERIOD = 1U << 8, PERF_SAMPLE_STREAM_ID = 1U << 9, - PERF_SAMPLE_TP_RECORD = 1U << 10, + PERF_SAMPLE_RAW = 1U << 10, PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ }; @@ -414,9 +414,9 @@ struct perf_callchain_entry { __u64 ip[PERF_MAX_STACK_DEPTH]; }; -struct perf_tracepoint_record { - int size; - char *record; +struct perf_raw_record { + u32 size; + void *data; }; struct task_struct; @@ -687,7 +687,7 @@ struct perf_sample_data { struct pt_regs *regs; u64 addr; u64 period; - void *private; + struct perf_raw_record *raw; }; extern int perf_counter_overflow(struct perf_counter *counter, int nmi, diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 117622cb73a3..002310540417 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,7 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; - struct perf_tracepoint_record *tp = NULL; + struct perf_raw_record *raw = NULL; int callchain_size = 0; u64 time; struct { @@ -2715,10 +2715,10 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, header.size += sizeof(u64); } - if (sample_type & PERF_SAMPLE_TP_RECORD) { - tp = data->private; - if (tp) - header.size += tp->size; + if (sample_type & PERF_SAMPLE_RAW) { + raw = data->raw; + if (raw) + header.size += raw->size; } ret = perf_output_begin(&handle, counter, header.size, nmi, 1); @@ -2784,8 +2784,8 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } - if ((sample_type & PERF_SAMPLE_TP_RECORD) && tp) - perf_output_copy(&handle, tp->record, tp->size); + if ((sample_type & PERF_SAMPLE_RAW) && raw) + perf_output_copy(&handle, raw->data, raw->size); perf_output_end(&handle); } @@ -3740,15 +3740,15 @@ static const struct pmu perf_ops_task_clock = { void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, int entry_size) { - struct perf_tracepoint_record tp = { + struct perf_raw_record raw = { .size = entry_size, - .record = record, + .data = record, }; struct perf_sample_data data = { .regs = get_irq_regs(), .addr = addr, - .private = &tp, + .raw = &raw, }; if (!data.regs) -- cgit v1.2.3 From 3a80b4a3539696f4b0574876326860323035a302 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Aug 2009 19:49:01 +0200 Subject: perf_counter: Fix a race on perf_counter_ctx While extending perfcounters with BTS hw-tracing, Markus Metzger managed to trigger this warning: [ 995.557128] WARNING: at kernel/perf_counter.c:1191 __perf_counter_task_sched_out+0x48/0x6b() triggers because commit 9f498cc5be7e013d8d6e4c616980ed0ffc8680d2 (perf_counter: Full task tracing) removed clearing of tsk->perf_counter_ctxp out from under ctx->lock which introduced a race (against perf_lock_task_context). 
Move it back and deal with the exit notification by explicitly passing along the former task context. Reported-by: Markus T Metzger Signed-off-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1249667341.17467.5.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 002310540417..546e62d62941 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2850,7 +2850,8 @@ perf_counter_read_event(struct perf_counter *counter, */ struct perf_task_event { - struct task_struct *task; + struct task_struct *task; + struct perf_counter_context *task_ctx; struct { struct perf_event_header header; @@ -2910,24 +2911,23 @@ static void perf_counter_task_ctx(struct perf_counter_context *ctx, static void perf_counter_task_event(struct perf_task_event *task_event) { struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; + struct perf_counter_context *ctx = task_event->task_ctx; cpuctx = &get_cpu_var(perf_cpu_context); perf_counter_task_ctx(&cpuctx->ctx, task_event); put_cpu_var(perf_cpu_context); rcu_read_lock(); - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ - ctx = rcu_dereference(current->perf_counter_ctxp); + if (!ctx) + ctx = rcu_dereference(task_event->task->perf_counter_ctxp); if (ctx) perf_counter_task_ctx(ctx, task_event); rcu_read_unlock(); } -static void perf_counter_task(struct task_struct *task, int new) +static void perf_counter_task(struct task_struct *task, + struct perf_counter_context *task_ctx, + int new) { struct perf_task_event task_event; @@ -2937,8 +2937,9 @@ static void perf_counter_task(struct task_struct *task, int new) return; task_event = (struct perf_task_event){ - .task = task, - .event = { + .task = task, + .task_ctx = task_ctx, + .event = { .header = { .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT, .misc = 0, @@ -2956,7 +2957,7 @@ static void perf_counter_task(struct task_struct *task, int new) void perf_counter_fork(struct task_struct *task) { - perf_counter_task(task, 1); + perf_counter_task(task, NULL, 1); } /* @@ -4310,7 +4311,7 @@ void perf_counter_exit_task(struct task_struct *child) unsigned long flags; if (likely(!child->perf_counter_ctxp)) { - perf_counter_task(child, 0); + perf_counter_task(child, NULL, 0); return; } @@ -4330,6 +4331,7 @@ void perf_counter_exit_task(struct task_struct *child) * incremented the context's refcount before we do put_ctx below. */ spin_lock(&child_ctx->lock); + child->perf_counter_ctxp = NULL; /* * If this context is a clone; unclone it so it can't get * swapped to another process while we're removing all @@ -4343,9 +4345,7 @@ void perf_counter_exit_task(struct task_struct *child) * won't get any samples after PERF_EVENT_EXIT. We can however still * get a few PERF_EVENT_READ events. */ - perf_counter_task(child, 0); - - child->perf_counter_ctxp = NULL; + perf_counter_task(child, child_ctx, 0); /* * We can recurse on the same lock type through: -- cgit v1.2.3 From beda2c7ea2c15ed01eef00a997d2b0496c3a502d Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Sun, 9 Aug 2009 15:34:39 -0700 Subject: futex: Update futex_q lock_ptr on requeue proxy lock futex_requeue() can acquire the lock on behalf of a waiter early on or during the requeue loop if it is uncontended or in the event of a lock steal or owner died. 
On wakeup, the waiter (in futex_wait_requeue_pi()) cleans up the pi_state owner using the lock_ptr to protect against concurrent access to the pi_state. The pi_state is hung off futex_q's on the requeue target futex hash bucket so the lock_ptr needs to be updated accordingly. The problem manifested by triggering the WARN_ON in lookup_pi_state() about the pid != pi_state->owner->pid. With this patch, the pi_state is properly guarded against concurrent access via the requeue target hb lock. The astute reviewer may notice that there is a window of time between when futex_requeue() unlocks the hb locks and when futex_wait_requeue_pi() will acquire hb2->lock. During this time the pi_state and uval are not in sync with the underlying rtmutex owner (but the uval does indicate there are waiters, so no atomic changes will occur in userspace). However, this is not a problem. Should a contending thread enter lookup_pi_state() and acquire hb2->lock before the ownership is fixed up, it will find the pi_state hung off a waiter's (possibly the pending owner's) futex_q and block on the rtmutex. Once futex_wait_requeue_pi() fixes up the owner, it will also move the pi_state from the old owner's task->pi_state_list to its own. v3: Fix plist lock name for application to mainline (rather than -rt) Compile tested against tip/v2.6.31-rc5. Signed-off-by: Darren Hart Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Eric Dumazet Cc: Dinakar Guniguntala Cc: John Stultz LKML-Reference: <4A7F4EFF.6090903@us.ibm.com> Signed-off-by: Ingo Molnar --- kernel/futex.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 0672ff88f159..8cc3ee1363a0 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1010,15 +1010,19 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue * q: the futex_q * key: the key of the requeue target futex + * hb: the hash_bucket of the requeue target futex * * During futex_requeue, with requeue_pi=1, it is possible to acquire the * target futex if it is uncontended or via a lock steal. Set the futex_q key * to the requeue target futex so the waiter can detect the wakeup on the right * futex, but remove it from the hb and NULL the rt_waiter so it can detect - * atomic lock acquisition. Must be called with the q->lock_ptr held. + * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock + * to protect access to the pi_state to fixup the owner later. Must be called + * with both q->lock_ptr and hb->lock held. */ static inline -void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) +void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, + struct futex_hash_bucket *hb) { drop_futex_key_refs(&q->key); get_futex_key_refs(key); @@ -1030,6 +1034,11 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) WARN_ON(!q->rt_waiter); q->rt_waiter = NULL; + q->lock_ptr = &hb->lock; +#ifdef CONFIG_DEBUG_PI_LIST + q->list.plist.lock = &hb->lock; +#endif + wake_up_state(q->task, TASK_NORMAL); } @@ -1088,7 +1097,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, set_waiters); if (ret == 1) - requeue_pi_wake_futex(top_waiter, key2); + requeue_pi_wake_futex(top_waiter, key2, hb2); return ret; } @@ -1273,7 +1282,7 @@ retry_private: this->task, 1); if (ret == 1) { /* We got the lock. 
*/ - requeue_pi_wake_futex(this, &key2); + requeue_pi_wake_futex(this, &key2, hb2); continue; } else if (ret) { /* -EDEADLK */ -- cgit v1.2.3 From a044560c3a1f0ad75ce685c1ed7604820b9ed319 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Aug 2009 11:16:52 +0200 Subject: perf_counter: Correct PERF_SAMPLE_RAW output PERF_SAMPLE_* output switches should unconditionally output the correct format, as they are the only way to unambiguously parse the PERF_EVENT_SAMPLE data. Signed-off-by: Peter Zijlstra Acked-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249896447.17467.74.camel@twins> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ include/trace/ftrace.h | 3 ++- kernel/perf_counter.c | 30 ++++++++++++++++++++++++------ 3 files changed, 28 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2aabe43c1d04..a9d823a93fe8 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -369,6 +369,8 @@ enum perf_event_type { * * { u64 nr, * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN + * { u32 size; + * char data[size];}&& PERF_SAMPLE_RAW * }; */ PERF_EVENT_SAMPLE = 9, diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 7fb16d90e7b1..7167b9b97da2 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -685,7 +685,8 @@ static void ftrace_profile_##call(proto) \ pc = preempt_count(); \ \ __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ - __entry_size = ALIGN(__data_size + sizeof(*entry), sizeof(u64));\ + __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\ + sizeof(u64)); \ \ do { \ char raw_data[__entry_size]; \ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 546e62d62941..5229d1666fa5 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,7 +2646,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; - struct perf_raw_record *raw = NULL; int callchain_size = 0; u64 time; struct { @@ -2716,9 +2715,15 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } if (sample_type & PERF_SAMPLE_RAW) { - raw = data->raw; - if (raw) - header.size += raw->size; + int size = sizeof(u32); + + if (data->raw) + size += data->raw->size; + else + size += sizeof(u32); + + WARN_ON_ONCE(size & (sizeof(u64)-1)); + header.size += size; } ret = perf_output_begin(&handle, counter, header.size, nmi, 1); @@ -2784,8 +2789,21 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } - if ((sample_type & PERF_SAMPLE_RAW) && raw) - perf_output_copy(&handle, raw->data, raw->size); + if (sample_type & PERF_SAMPLE_RAW) { + if (data->raw) { + perf_output_put(&handle, data->raw->size); + perf_output_copy(&handle, data->raw->data, data->raw->size); + } else { + struct { + u32 size; + u32 data; + } raw = { + .size = sizeof(u32), + .data = 0, + }; + perf_output_put(&handle, raw); + } + } perf_output_end(&handle); } -- cgit v1.2.3 From a4e95fc2cbb31d70a65beffeaf8773f881328c34 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Aug 2009 11:20:12 +0200 Subject: perf_counter: Require CAP_SYS_ADMIN for raw tracepoint data Raw tracepoint data contains various kernel internals and data from other users, so restrict this to CAP_SYS_ADMIN. 
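For illustration only (not part of this patch): a minimal user-space sketch of how the restriction surfaces, assuming the 2.6.31-era perf_counter ABI, a __NR_perf_counter_open syscall number for the build architecture, and an arbitrary tracepoint id. Run without CAP_SYS_ADMIN, the open is expected to fail with EPERM before the tracepoint id is even looked at.

#include <linux/perf_counter.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <stdio.h>

int main(void)
{
	struct perf_counter_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size          = sizeof(attr);
	attr.type          = PERF_TYPE_TRACEPOINT;
	attr.config        = 42;              /* hypothetical tracepoint id; the capability check runs first */
	attr.sample_type   = PERF_SAMPLE_RAW; /* raw data => CAP_SYS_ADMIN only after this patch */
	attr.sample_period = 1;

	fd = syscall(__NR_perf_counter_open, &attr, getpid(), -1, -1, 0);
	if (fd < 0 && errno == EPERM)
		printf("PERF_SAMPLE_RAW rejected without CAP_SYS_ADMIN\n");
	return 0;
}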
Signed-off-by: Peter Zijlstra Acked-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249896452.17467.75.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 5229d1666fa5..b0b20a07f394 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3787,6 +3787,14 @@ static void tp_perf_counter_destroy(struct perf_counter *counter) static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { + /* + * Raw tracepoint data is a severe data leak, only allow root to + * have these. + */ + if ((counter->attr.sample_type & PERF_SAMPLE_RAW) && + !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + if (ftrace_profile_enable(counter->attr.config)) return NULL; -- cgit v1.2.3 From 2fc391112fb6f3424435a3aa2fda887497b5f807 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Aug 2009 12:33:05 +0100 Subject: locking, sched: Give waitqueue spinlocks their own lockdep classes Give waitqueue spinlocks their own lockdep classes when they are initialised from init_waitqueue_head(). This means that struct wait_queue::func functions can operate on other waitqueues. This is used by CacheFiles to catch the page from a backing fs being unlocked and to wake up another thread to take a copy of it. Signed-off-by: Peter Zijlstra Signed-off-by: David Howells Tested-by: Takashi Iwai Cc: linux-cachefs@redhat.com Cc: torvalds@osdl.org Cc: akpm@linux-foundation.org LKML-Reference: <20090810113305.17284.81508.stgit@warthog.procyon.org.uk> Signed-off-by: Ingo Molnar --- include/linux/wait.h | 9 ++++++++- kernel/wait.c | 5 +++-- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/wait.h b/include/linux/wait.h index 6788e1a4d4ca..cf3c2f5dba51 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -77,7 +77,14 @@ struct task_struct; #define __WAIT_BIT_KEY_INITIALIZER(word, bit) \ { .flags = word, .bit_nr = bit, } -extern void init_waitqueue_head(wait_queue_head_t *q); +extern void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *); + +#define init_waitqueue_head(q) \ + do { \ + static struct lock_class_key __key; \ + \ + __init_waitqueue_head((q), &__key); \ + } while (0) #ifdef CONFIG_LOCKDEP # define __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) \ diff --git a/kernel/wait.c b/kernel/wait.c index ea7c3b4275cf..c4bd3d825f35 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -10,13 +10,14 @@ #include #include -void init_waitqueue_head(wait_queue_head_t *q) +void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key) { spin_lock_init(&q->lock); + lockdep_set_class(&q->lock, key); INIT_LIST_HEAD(&q->task_list); } -EXPORT_SYMBOL(init_waitqueue_head); +EXPORT_SYMBOL(__init_waitqueue_head); void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) { -- cgit v1.2.3 From 4dc88029fd916b860ef063c40180aa604ce93494 Mon Sep 17 00:00:00 2001 From: Dinakar Guniguntala Date: Mon, 10 Aug 2009 18:31:42 +0530 Subject: futex: Fix compat_futex to be same as futex for REQUEUE_PI Need to add the REQUEUE_PI checks to the compat_sys_futex API as well to ensure 32 bit requeues work fine on a 64 bit system.
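To see the user-visible effect, a hedged sketch (not part of the patch; assumes FUTEX_WAIT_REQUEUE_PI is exposed by the installed <linux/futex.h>): built with -m32 and run on a 64-bit kernel it goes through compat_sys_futex(), where the deadline below was previously never parsed, turning the timed wait into an unbounded one.

/* link with -lrt on older glibc for clock_gettime() */
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static long futex(uint32_t *uaddr, int op, uint32_t val,
		  const struct timespec *timeout, uint32_t *uaddr2, uint32_t val3)
{
	return syscall(SYS_futex, uaddr, op, val, timeout, uaddr2, val3);
}

int main(void)
{
	uint32_t f1 = 0, f2 = 0;
	struct timespec deadline;
	long ret;

	/* the requeue-PI wait takes an absolute CLOCK_MONOTONIC deadline */
	clock_gettime(CLOCK_MONOTONIC, &deadline);
	deadline.tv_sec += 1;

	/* Timed wait, expecting a later requeue onto &f2; nothing requeues
	 * us here, so correct timeout handling returns after roughly 1s. */
	ret = futex(&f1, FUTEX_WAIT_REQUEUE_PI, 0, &deadline, &f2, 0);
	if (ret < 0)
		perror("futex"); /* ETIMEDOUT expected once the compat path parses the deadline */
	return 0;
}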
Patch is against latest tip Signed-off-by: Dinakar Guniguntala Cc: Darren Hart LKML-Reference: <20090810130142.GA23619@in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/futex_compat.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index d607a5b9ee29..235716556bf1 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -180,7 +180,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, int cmd = op & FUTEX_CMD_MASK; if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET)) { + cmd == FUTEX_WAIT_BITSET || + cmd == FUTEX_WAIT_REQUEUE_PI)) { if (get_compat_timespec(&ts, utime)) return -EFAULT; if (!timespec_valid(&ts)) @@ -191,7 +192,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, t = ktime_add_safe(ktime_get(), t); tp = &t; } - if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) + if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || + cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) val2 = (int) (unsigned long) utime; return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); -- cgit v1.2.3 From 392741e0a4e17c82e3978b7fcbf04291294dc0a1 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 7 Aug 2009 15:20:48 -0700 Subject: futex: Fix handling of bad requeue syscall pairing If futex_requeue(requeue_pi=1) finds a futex_q that was created by a call other than futex_wait_requeue_pi(), the q.rt_waiter may be null. If so, this will result in an oops from the following call graph: futex_requeue() rt_mutex_start_proxy_lock() task_blocks_on_rt_mutex() waiter->task dereference OOPS We currently WARN_ON() if this is detected; clearly this is inadequate. If we detect a mispairing in futex_requeue(), bail out, sending -EINVAL to user-space. V2: Fix parenthesis warnings. Signed-off-by: Darren Hart Acked-by: Peter Zijlstra Cc: Steven Rostedt Cc: John Kacur Cc: Eric Dumazet Cc: Dinakar Guniguntala Cc: John Stultz LKML-Reference: <4A7CA8C0.7010809@us.ibm.com> Signed-off-by: Ingo Molnar --- kernel/futex.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 8cc3ee1363a0..e18cfbdc7190 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1256,8 +1256,15 @@ retry_private: if (!match_futex(&this->key, &key1)) continue; - WARN_ON(!requeue_pi && this->rt_waiter); - WARN_ON(requeue_pi && !this->rt_waiter); + /* + * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always + * be paired with each other and no other futex ops. + */ + if ((requeue_pi && !this->rt_waiter) || + (!requeue_pi && this->rt_waiter)) { + ret = -EINVAL; + break; + } /* * Wake nr_wake waiters. For requeue_pi, if we acquired the -- cgit v1.2.3 From 39cbb602b543e477df71dca84b5b2e36f8bd29fc Mon Sep 17 00:00:00 2001 From: "Alan D. Brunelle" Date: Fri, 7 Aug 2009 12:01:08 -0400 Subject: Remove double removal of blktrace directory commit fd51d251e4cdb21f68e9dbc4336514d64a105a79 Author: Stefan Raspl Date: Tue May 19 09:59:08 2009 +0200 blktrace: remove debugfs entries on bad path added an explicit invocation of debugfs_remove for bt->dir, but in blk_remove_buf_file_callback we are also getting the directory removed. On occasion I am seeing memory corruption that I have bisected down to this commit. [The testing involves a (long) series of I/O benchmarks with blktrace invoked around the actual runs.]
I believe that this committed patch is correct, but the problem actually lies in the code in blk_remove_buf_file_callback. With this patch I am able to consistently get complete runs whereas previously I could not get a single run to complete. The first part of the patch simply moves the debugfs_remove below the relay_close: the relay_close call will remove files under bt->dir, and so we should not remove the directory until all the files we created have been removed. (Note: This is not sufficient to fix the problem - the file system code has ref counts on the directory, so our invocation does not cause the directory to actually be removed. Nonetheless, we should not rely upon that feature.) Signed-off-by: Alan D. Brunelle Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 1090b0aed9ba..7a34cb563fec 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -267,8 +267,8 @@ static void blk_trace_free(struct blk_trace *bt) { debugfs_remove(bt->msg_file); debugfs_remove(bt->dropped_file); - debugfs_remove(bt->dir); relay_close(bt->rchan); + debugfs_remove(bt->dir); free_percpu(bt->sequence); free_percpu(bt->msg_data); kfree(bt); @@ -378,18 +378,8 @@ static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, static int blk_remove_buf_file_callback(struct dentry *dentry) { - struct dentry *parent = dentry->d_parent; debugfs_remove(dentry); - /* - * this will fail for all but the last file, but that is ok. what we - * care about is the top level buts->name directory going away, when - * the last trace file is gone. Then we don't have to rmdir() that - * manually on trace stop, so it nicely solves the issue with - * force killing of running traces. - */ - - debugfs_remove(parent); return 0; } -- cgit v1.2.3 From 28402971d869e26271b25301011f667d3a5640c3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 13 Aug 2009 10:13:22 +0200 Subject: perf_counter: Provide hw_perf_counter_setup_online() APIs Provide weak aliases for hw_perf_counter_setup_online(). This is used by the BTS patches (for v2.6.32), but it interacts with fixes so propagate this upstream. (it has no effect as of yet) Also export perf_counter_output() to architecture code.
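The hook uses the kernel's usual weak-symbol pattern: the core supplies a barrier-only default and an architecture can override it with a strong definition. A stand-alone user-space illustration of that linking behaviour (the function name is only borrowed for clarity; this is not kernel code):

#include <stdio.h>

/* weak default, analogous to the one added in kernel/perf_counter.c */
__attribute__((weak)) void hw_perf_counter_setup_online(int cpu)
{
	printf("weak default for CPU %d (no arch override linked in)\n", cpu);
}

/* an architecture would ship a strong definition in its own object file:
 * void hw_perf_counter_setup_online(int cpu) { ... }
 * and the linker would pick that one over the weak default above. */

int main(void)
{
	hw_perf_counter_setup_online(0);
	return 0;
}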
Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ kernel/perf_counter.c | 10 +++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a9d823a93fe8..2b36afe6ae0f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -694,6 +694,8 @@ struct perf_sample_data { extern int perf_counter_overflow(struct perf_counter *counter, int nmi, struct perf_sample_data *data); +extern void perf_counter_output(struct perf_counter *counter, int nmi, + struct perf_sample_data *data); /* * Return 1 for a software counter, 0 for a hardware counter diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b0b20a07f394..e26d2fcfa320 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -88,6 +88,7 @@ void __weak hw_perf_disable(void) { barrier(); } void __weak hw_perf_enable(void) { barrier(); } void __weak hw_perf_counter_setup(int cpu) { barrier(); } +void __weak hw_perf_counter_setup_online(int cpu) { barrier(); } int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, @@ -2630,7 +2631,7 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) return task_pid_nr_ns(p, counter->ns); } -static void perf_counter_output(struct perf_counter *counter, int nmi, +void perf_counter_output(struct perf_counter *counter, int nmi, struct perf_sample_data *data) { int ret; @@ -4592,6 +4593,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) perf_counter_init_cpu(cpu); break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + hw_perf_counter_setup_online(cpu); + break; + case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: perf_counter_exit_cpu(cpu); @@ -4616,6 +4622,8 @@ void __init perf_counter_init(void) { perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); + perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, + (void *)(long)smp_processor_id()); register_cpu_notifier(&perf_cpu_nb); } -- cgit v1.2.3 From bcfc2602e8541ac13b1def38e2591dca072cff7a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Aug 2009 09:51:55 +0200 Subject: perf_counter: Fix swcounter context invariance perf_swcounter_is_counting() uses a lock, which means we cannot use swcounters from NMI or when holding that particular lock, this is unintended. The below removes the lock, this opens up race window, but not worse than the swcounters already experience due to RCU traversal of the context in perf_swcounter_ctx_event(). This also fixes the hard lockups while opening a lockdep tracepoint counter. Signed-off-by: Peter Zijlstra Acked-by: Frederic Weisbecker Cc: Paul Mackerras Cc: stephane eranian Cc: Corey J Ashford LKML-Reference: <1250149915.10001.66.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 44 ++++++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index e26d2fcfa320..3dd4339589a0 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3444,40 +3444,32 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr, static int perf_swcounter_is_counting(struct perf_counter *counter) { - struct perf_counter_context *ctx; - unsigned long flags; - int count; - + /* + * The counter is active, we're good! 
+ */ if (counter->state == PERF_COUNTER_STATE_ACTIVE) return 1; + /* + * The counter is off/error, not counting. + */ if (counter->state != PERF_COUNTER_STATE_INACTIVE) return 0; /* - * If the counter is inactive, it could be just because - * its task is scheduled out, or because it's in a group - * which could not go on the PMU. We want to count in - * the first case but not the second. If the context is - * currently active then an inactive software counter must - * be the second case. If it's not currently active then - * we need to know whether the counter was active when the - * context was last active, which we can determine by - * comparing counter->tstamp_stopped with ctx->time. - * - * We are within an RCU read-side critical section, - * which protects the existence of *ctx. + * The counter is inactive, if the context is active + * we're part of a group that didn't make it on the 'pmu', + * not counting. */ - ctx = counter->ctx; - spin_lock_irqsave(&ctx->lock, flags); - count = 1; - /* Re-check state now we have the lock */ - if (counter->state < PERF_COUNTER_STATE_INACTIVE || - counter->ctx->is_active || - counter->tstamp_stopped < ctx->time) - count = 0; - spin_unlock_irqrestore(&ctx->lock, flags); - return count; + if (counter->ctx->is_active) + return 0; + + /* + * We're inactive and the context is too, this means the + * task is scheduled out, we're counting events that happen + * to us, like migration events. + */ + return 1; } static int perf_swcounter_match(struct perf_counter *counter, -- cgit v1.2.3 From 3dab77fb1bf89664bb1c9544607159dcab6f7d57 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Aug 2009 11:47:53 +0200 Subject: perf: Rework/fix the whole read vs group stuff Replace PERF_SAMPLE_GROUP with PERF_SAMPLE_READ and introduce PERF_FORMAT_GROUP to deal with group reads in a more generic way. This allows you to get group reads out of read() as well. Signed-off-by: Peter Zijlstra Cc: Corey J Ashford Cc: Paul Mackerras Cc: stephane eranian LKML-Reference: <20090813103655.117411814@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 47 ++++++-- kernel/perf_counter.c | 274 +++++++++++++++++++++++++++++++------------ 2 files changed, 238 insertions(+), 83 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2b36afe6ae0f..b53f7006cc4e 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -115,7 +115,7 @@ enum perf_counter_sample_format { PERF_SAMPLE_TID = 1U << 1, PERF_SAMPLE_TIME = 1U << 2, PERF_SAMPLE_ADDR = 1U << 3, - PERF_SAMPLE_GROUP = 1U << 4, + PERF_SAMPLE_READ = 1U << 4, PERF_SAMPLE_CALLCHAIN = 1U << 5, PERF_SAMPLE_ID = 1U << 6, PERF_SAMPLE_CPU = 1U << 7, @@ -127,16 +127,32 @@ enum perf_counter_sample_format { }; /* - * Bits that can be set in attr.read_format to request that - * reads on the counter should return the indicated quantities, - * in increasing order of bit value, after the counter value. 
+ * The format of the data returned by read() on a perf counter fd, + * as specified by attr.read_format: + * + * struct read_format { + * { u64 value; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 id; } && PERF_FORMAT_ID + * } && !PERF_FORMAT_GROUP + * + * { u64 nr; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 value; + * { u64 id; } && PERF_FORMAT_ID + * } cntr[nr]; + * } && PERF_FORMAT_GROUP + * }; */ enum perf_counter_read_format { PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, PERF_FORMAT_ID = 1U << 2, + PERF_FORMAT_GROUP = 1U << 3, - PERF_FORMAT_MAX = 1U << 3, /* non-ABI */ + PERF_FORMAT_MAX = 1U << 4, /* non-ABI */ }; #define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ @@ -343,10 +359,8 @@ enum perf_event_type { * struct { * struct perf_event_header header; * u32 pid, tid; - * u64 value; - * { u64 time_enabled; } && PERF_FORMAT_ENABLED - * { u64 time_running; } && PERF_FORMAT_RUNNING - * { u64 parent_id; } && PERF_FORMAT_ID + * + * struct read_format values; * }; */ PERF_EVENT_READ = 8, @@ -364,11 +378,22 @@ enum perf_event_type { * { u32 cpu, res; } && PERF_SAMPLE_CPU * { u64 period; } && PERF_SAMPLE_PERIOD * - * { u64 nr; - * { u64 id, val; } cnt[nr]; } && PERF_SAMPLE_GROUP + * { struct read_format values; } && PERF_SAMPLE_READ * * { u64 nr, * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN + * + * # + * # The RAW record below is opaque data wrt the ABI + * # + * # That is, the ABI doesn't make any promises wrt to + * # the stability of its content, it may vary depending + * # on event, hardware, kernel version and phase of + * # the moon. + * # + * # In other words, PERF_SAMPLE_RAW contents are not an ABI. 
+ * # + * * { u32 size; * char data[size];}&& PERF_SAMPLE_RAW * }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 3dd4339589a0..b8c6b97a20a3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1692,7 +1692,32 @@ static int perf_release(struct inode *inode, struct file *file) return 0; } -static u64 perf_counter_read_tree(struct perf_counter *counter) +static int perf_counter_read_size(struct perf_counter *counter) +{ + int entry = sizeof(u64); /* value */ + int size = 0; + int nr = 1; + + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + size += sizeof(u64); + + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + size += sizeof(u64); + + if (counter->attr.read_format & PERF_FORMAT_ID) + entry += sizeof(u64); + + if (counter->attr.read_format & PERF_FORMAT_GROUP) { + nr += counter->group_leader->nr_siblings; + size += sizeof(u64); + } + + size += entry * nr; + + return size; +} + +static u64 perf_counter_read_value(struct perf_counter *counter) { struct perf_counter *child; u64 total = 0; @@ -1704,14 +1729,96 @@ static u64 perf_counter_read_tree(struct perf_counter *counter) return total; } +static int perf_counter_read_entry(struct perf_counter *counter, + u64 read_format, char __user *buf) +{ + int n = 0, count = 0; + u64 values[2]; + + values[n++] = perf_counter_read_value(counter); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(counter); + + count = n * sizeof(u64); + + if (copy_to_user(buf, values, count)) + return -EFAULT; + + return count; +} + +static int perf_counter_read_group(struct perf_counter *counter, + u64 read_format, char __user *buf) +{ + struct perf_counter *leader = counter->group_leader, *sub; + int n = 0, size = 0, err = -EFAULT; + u64 values[3]; + + values[n++] = 1 + leader->nr_siblings; + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = leader->total_time_enabled + + atomic64_read(&leader->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = leader->total_time_running + + atomic64_read(&leader->child_total_time_running); + } + + size = n * sizeof(u64); + + if (copy_to_user(buf, values, size)) + return -EFAULT; + + err = perf_counter_read_entry(leader, read_format, buf + size); + if (err < 0) + return err; + + size += err; + + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + err = perf_counter_read_entry(counter, read_format, + buf + size); + if (err < 0) + return err; + + size += err; + } + + return size; +} + +static int perf_counter_read_one(struct perf_counter *counter, + u64 read_format, char __user *buf) +{ + u64 values[4]; + int n = 0; + + values[n++] = perf_counter_read_value(counter); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = counter->total_time_enabled + + atomic64_read(&counter->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = counter->total_time_running + + atomic64_read(&counter->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(counter); + + if (copy_to_user(buf, values, n * sizeof(u64))) + return -EFAULT; + + return n * sizeof(u64); +} + /* * Read the performance counter - simple non blocking version for now */ static ssize_t perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) { - u64 values[4]; - int n; + u64 read_format = counter->attr.read_format; + int ret; /* * Return end-of-file for a read on a counter that is in @@ -1721,28 
+1828,18 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) if (counter->state == PERF_COUNTER_STATE_ERROR) return 0; + if (count < perf_counter_read_size(counter)) + return -ENOSPC; + WARN_ON_ONCE(counter->ctx->parent_ctx); mutex_lock(&counter->child_mutex); - values[0] = perf_counter_read_tree(counter); - n = 1; - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); - if (counter->attr.read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); + if (read_format & PERF_FORMAT_GROUP) + ret = perf_counter_read_group(counter, read_format, buf); + else + ret = perf_counter_read_one(counter, read_format, buf); mutex_unlock(&counter->child_mutex); - if (count < n * sizeof(u64)) - return -EINVAL; - count = n * sizeof(u64); - - if (copy_to_user(buf, values, count)) - return -EFAULT; - - return count; + return ret; } static ssize_t @@ -2631,6 +2728,79 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) return task_pid_nr_ns(p, counter->ns); } +static void perf_output_read_one(struct perf_output_handle *handle, + struct perf_counter *counter) +{ + u64 read_format = counter->attr.read_format; + u64 values[4]; + int n = 0; + + values[n++] = atomic64_read(&counter->count); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = counter->total_time_enabled + + atomic64_read(&counter->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = counter->total_time_running + + atomic64_read(&counter->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(counter); + + perf_output_copy(handle, values, n * sizeof(u64)); +} + +/* + * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult. 
+ */ +static void perf_output_read_group(struct perf_output_handle *handle, + struct perf_counter *counter) +{ + struct perf_counter *leader = counter->group_leader, *sub; + u64 read_format = counter->attr.read_format; + u64 values[5]; + int n = 0; + + values[n++] = 1 + leader->nr_siblings; + + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = leader->total_time_enabled; + + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = leader->total_time_running; + + if (leader != counter) + leader->pmu->read(leader); + + values[n++] = atomic64_read(&leader->count); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(leader); + + perf_output_copy(handle, values, n * sizeof(u64)); + + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + n = 0; + + if (sub != counter) + sub->pmu->read(sub); + + values[n++] = atomic64_read(&sub->count); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(sub); + + perf_output_copy(handle, values, n * sizeof(u64)); + } +} + +static void perf_output_read(struct perf_output_handle *handle, + struct perf_counter *counter) +{ + if (counter->attr.read_format & PERF_FORMAT_GROUP) + perf_output_read_group(handle, counter); + else + perf_output_read_one(handle, counter); +} + void perf_counter_output(struct perf_counter *counter, int nmi, struct perf_sample_data *data) { @@ -2642,10 +2812,6 @@ void perf_counter_output(struct perf_counter *counter, int nmi, struct { u32 pid, tid; } tid_entry; - struct { - u64 id; - u64 counter; - } group_entry; struct perf_callchain_entry *callchain = NULL; int callchain_size = 0; u64 time; @@ -2700,10 +2866,8 @@ void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_PERIOD) header.size += sizeof(u64); - if (sample_type & PERF_SAMPLE_GROUP) { - header.size += sizeof(u64) + - counter->nr_siblings * sizeof(group_entry); - } + if (sample_type & PERF_SAMPLE_READ) + header.size += perf_counter_read_size(counter); if (sample_type & PERF_SAMPLE_CALLCHAIN) { callchain = perf_callchain(data->regs); @@ -2760,26 +2924,8 @@ void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_PERIOD) perf_output_put(&handle, data->period); - /* - * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. 
- */ - if (sample_type & PERF_SAMPLE_GROUP) { - struct perf_counter *leader, *sub; - u64 nr = counter->nr_siblings; - - perf_output_put(&handle, nr); - - leader = counter->group_leader; - list_for_each_entry(sub, &leader->sibling_list, list_entry) { - if (sub != counter) - sub->pmu->read(sub); - - group_entry.id = primary_counter_id(sub); - group_entry.counter = atomic64_read(&sub->count); - - perf_output_put(&handle, group_entry); - } - } + if (sample_type & PERF_SAMPLE_READ) + perf_output_read(&handle, counter); if (sample_type & PERF_SAMPLE_CALLCHAIN) { if (callchain) @@ -2818,8 +2964,6 @@ struct perf_read_event { u32 pid; u32 tid; - u64 value; - u64 format[3]; }; static void @@ -2831,34 +2975,20 @@ perf_counter_read_event(struct perf_counter *counter, .header = { .type = PERF_EVENT_READ, .misc = 0, - .size = sizeof(event) - sizeof(event.format), + .size = sizeof(event) + perf_counter_read_size(counter), }, .pid = perf_counter_pid(counter, task), .tid = perf_counter_tid(counter, task), - .value = atomic64_read(&counter->count), }; - int ret, i = 0; - - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - event.header.size += sizeof(u64); - event.format[i++] = counter->total_time_enabled; - } - - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - event.header.size += sizeof(u64); - event.format[i++] = counter->total_time_running; - } - - if (counter->attr.read_format & PERF_FORMAT_ID) { - event.header.size += sizeof(u64); - event.format[i++] = primary_counter_id(counter); - } + int ret; ret = perf_output_begin(&handle, counter, event.header.size, 0, 0); if (ret) return; - perf_output_copy(&handle, &event, event.header.size); + perf_output_put(&handle, event); + perf_output_read(&handle, counter); + perf_output_end(&handle); } @@ -3921,9 +4051,9 @@ perf_counter_alloc(struct perf_counter_attr *attr, atomic64_set(&hwc->period_left, hwc->sample_period); /* - * we currently do not support PERF_SAMPLE_GROUP on inherited counters + * we currently do not support PERF_FORMAT_GROUP on inherited counters */ - if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) + if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) goto done; switch (attr->type) { -- cgit v1.2.3 From 970892a9031a5dc7217bd394fb9d89fa75a4a7bc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Aug 2009 11:47:54 +0200 Subject: perf_counter: Fix an ipi-deadlock perf_pending_counter() is called from IRQ context and will call perf_counter_disable(), however perf_counter_disable() uses smp_call_function_single() which doesn't fancy being used with IRQs disabled due to IPI deadlocks. Fix this by making it use the local __perf_counter_disable() call and teaching the counter_sched_out() code about pending disables as well. This should cover the case where a counter migrates before the pending queue gets processed. 
Signed-off-by: Peter Zijlstra Cc: Corey J Ashford Cc: Paul Mackerras Cc: stephane eranian LKML-Reference: <20090813103655.244097721@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b8c6b97a20a3..3f841beefc7b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -307,6 +307,10 @@ counter_sched_out(struct perf_counter *counter, return; counter->state = PERF_COUNTER_STATE_INACTIVE; + if (counter->pending_disable) { + counter->pending_disable = 0; + counter->state = PERF_COUNTER_STATE_OFF; + } counter->tstamp_stopped = ctx->time; counter->pmu->disable(counter); counter->oncpu = -1; @@ -2343,7 +2347,7 @@ static void perf_pending_counter(struct perf_pending_entry *entry) if (counter->pending_disable) { counter->pending_disable = 0; - perf_counter_disable(counter); + __perf_counter_disable(counter); } if (counter->pending_wakeup) { -- cgit v1.2.3 From 94d5d1b2d891f1fd5205f978246b7864d998b25c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Aug 2009 16:14:42 +0200 Subject: perf_counter: Report the cloning task as parent on perf_counter_fork() A bug in (9f498cc: perf_counter: Full task tracing) makes profiling multi-threaded apps go belly up. [ output as: (PID:TID):(PPID:PTID) ] # ./perf report -D | grep FORK 0x4b0 [0x18]: PERF_EVENT_FORK: (3237:3237):(3236:3236) 0xa10 [0x18]: PERF_EVENT_FORK: (3237:3238):(3236:3236) 0xa70 [0x18]: PERF_EVENT_FORK: (3237:3239):(3236:3236) 0xad0 [0x18]: PERF_EVENT_FORK: (3237:3240):(3236:3236) 0xb18 [0x18]: PERF_EVENT_FORK: (3237:3241):(3236:3236) Shows us that the test (27d028d perf report: Update for the new FORK/EXIT events) in builtin-report.c: /* * A thread clone will have the same PID for both * parent and child. */ if (thread == parent) return 0; Will clearly fail. The problem is that perf_counter_fork() reports the actual parent, instead of the cloning thread. Fixing that (with the below patch), yields: # ./perf report -D | grep FORK 0x4c8 [0x18]: PERF_EVENT_FORK: (1590:1590):(1589:1589) 0xbd8 [0x18]: PERF_EVENT_FORK: (1590:1591):(1590:1590) 0xc80 [0x18]: PERF_EVENT_FORK: (1590:1592):(1590:1590) 0x3338 [0x18]: PERF_EVENT_FORK: (1590:1593):(1590:1590) 0x66b0 [0x18]: PERF_EVENT_FORK: (1590:1594):(1590:1590) Which both makes more sense and doesn't confuse perf report anymore.
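For reference, a stand-alone sketch of the FORK/EXIT record body whose ppid/ptid fields this patch redefines to mean the cloning task (layout as documented in the 2.6.31-era include/linux/perf_counter.h; the structs are re-declared here only so the size check compiles outside the kernel):

#include <stdint.h>
#include <stdio.h>

struct perf_event_header {
	uint32_t type;
	uint16_t misc;
	uint16_t size;
};

struct fork_exit_event {
	struct perf_event_header header;
	uint32_t pid, ppid;
	uint32_t tid, ptid;
};

int main(void)
{
	/* matches the [0x18] record size visible in the perf report -D dumps above */
	printf("PERF_EVENT_FORK record size: 0x%zx bytes\n",
	       sizeof(struct fork_exit_event));
	return 0;
}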
Reported-by: Pekka Enberg Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: paulus@samba.org Cc: Anton Blanchard Cc: Arjan van de Ven LKML-Reference: <1250172882.5241.62.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 3f841beefc7b..534e20d14d63 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3028,10 +3028,10 @@ static void perf_counter_task_output(struct perf_counter *counter, return; task_event->event.pid = perf_counter_pid(counter, task); - task_event->event.ppid = perf_counter_pid(counter, task->real_parent); + task_event->event.ppid = perf_counter_pid(counter, current); task_event->event.tid = perf_counter_tid(counter, task); - task_event->event.ptid = perf_counter_tid(counter, task->real_parent); + task_event->event.ptid = perf_counter_tid(counter, current); perf_output_put(&handle, task_event->event); perf_output_end(&handle); -- cgit v1.2.3 From 2d860ad76f4ee4d2eba0fe3797c8d7cdce432cc0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 13 Aug 2009 13:05:10 -0700 Subject: genirq: prevent wakeup of freed irq thread free_irq() can remove an irqaction while the corresponding interrupt is in progress, but free_irq() sets action->thread to NULL unconditionally, which might lead to a NULL pointer dereference in handle_IRQ_event() when the hard interrupt context tries to wake up the handler thread. Prevent this by moving the thread stop after synchronize_irq(). No need to set action->thread to NULL either as action is going to be freed anyway. This fixes a boot crash reported against preempt-rt which uses the mainline irq threads code to implement full irq threading. [ tglx: removed local irqthread variable ] Signed-off-by: Linus Torvalds Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 61c679db4687..d222515a5a06 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -761,7 +761,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action, **action_ptr; - struct task_struct *irqthread; unsigned long flags; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); @@ -809,9 +808,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) desc->chip->disable(irq); } - irqthread = action->thread; - action->thread = NULL; - spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); @@ -819,12 +815,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) /* Make sure it's not being used on another CPU: */ synchronize_irq(irq); - if (irqthread) { - if (!test_bit(IRQTF_DIED, &action->thread_flags)) - kthread_stop(irqthread); - put_task_struct(irqthread); - } - #ifdef CONFIG_DEBUG_SHIRQ /* * It's a shared IRQ -- the driver ought to be prepared for an IRQ @@ -840,6 +830,13 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) local_irq_restore(flags); } #endif + + if (action->thread) { + if (!test_bit(IRQTF_DIED, &action->thread_flags)) + kthread_stop(action->thread); + put_task_struct(action->thread); + } + return action; } -- cgit v1.2.3