From 4a99854c5840065e7d3a464523cbe1993acb4f00 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs
Date: Fri, 28 Feb 2014 14:30:45 -0500
Subject: audit: __audit_syscall_entry: ignore arch arg and call syscall_get_arch() directly

Since every arch should have syscall_get_arch() defined, stop using the
function argument and just collect this ourselves. We do not drop the
argument as fixing some code paths (in assembly) to not pass this first
argument is non-trivial. The argument will be dropped when that is fixed.

Signed-off-by: Richard Guy Briggs
Signed-off-by: Eric Paris
---
 kernel/auditsc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 21eae3c05ec0..dff2a2325655 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1536,7 +1536,7 @@ void __audit_syscall_entry(int arch, int major,
 	if (!audit_enabled)
 		return;
 
-	context->arch = arch;
+	context->arch = syscall_get_arch();
 	context->major = major;
 	context->argv[0] = a1;
 	context->argv[1] = a2;
-- cgit v1.2.3

From 84db564aad45774ab64375ee019d5e7a42675b1f Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs
Date: Wed, 29 Jan 2014 16:17:58 -0500
Subject: audit: add arch field to seccomp event log

The AUDIT_SECCOMP record looks something like this:

type=SECCOMP msg=audit(1373478171.953:32775): auid=4325 uid=4325 gid=4325 ses=1 subj=unconfined_u:unconfined_r:unconfined_t:s0 pid=12381 comm="test" sig=31 syscall=231 compat=0 ip=0x39ea8bca89 code=0x0

In order to determine what syscall 231 maps to, we need to have the arch=
field right before it.

To see the event, compile this test.c program:

=====
int main(void)
{
	return seccomp_load(seccomp_init(SCMP_ACT_KILL));
}
=====

gcc -g test.c -o test -lseccomp

After running the program, find the record by:

ausearch --start recent -m SECCOMP -i

Signed-off-by: Richard Guy Briggs
Signed-off-by: Eric Paris
---
 kernel/auditsc.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index dff2a2325655..9f03ac205e1f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -67,6 +67,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -2488,11 +2489,9 @@ void __audit_seccomp(unsigned long syscall, long signr, int code)
 	if (unlikely(!ab))
 		return;
 	audit_log_task(ab);
-	audit_log_format(ab, " sig=%ld", signr);
-	audit_log_format(ab, " syscall=%ld", syscall);
-	audit_log_format(ab, " compat=%d", is_compat_task());
-	audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current));
-	audit_log_format(ab, " code=0x%x", code);
+	audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x",
+			 signr, syscall_get_arch(), syscall, is_compat_task(),
+			 KSTK_EIP(current), code);
 	audit_log_end(ab);
 }
 
-- cgit v1.2.3

From b4f0d3755c5e9cc86292d5fd78261903b4f23d4a Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs
Date: Tue, 4 Mar 2014 10:38:06 -0500
Subject: audit: x86: drop arch from __audit_syscall_entry() interface

Since the arch is found locally in __audit_syscall_entry(), there is no
need to pass it in as a parameter. Delete it from the parameter list.

x86* was the only arch to call __audit_syscall_entry() directly and did
so from assembly code.

Signed-off-by: Richard Guy Briggs
Cc: Thomas Gleixner
Cc: Ingo Molnar
Cc: "H.
Peter Anvin" Cc: x86@kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-audit@redhat.com Signed-off-by: Eric Paris --- As this patch relies on changes in the audit tree, I think it appropriate to send it through my tree rather than the x86 tree. --- arch/x86/ia32/ia32entry.S | 12 ++++++------ arch/x86/kernel/entry_32.S | 11 +++++------ arch/x86/kernel/entry_64.S | 11 +++++------ include/linux/audit.h | 5 ++--- kernel/auditsc.c | 6 ++---- 5 files changed, 20 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 4299eb05023c..f5bdd2881815 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -186,12 +186,12 @@ sysexit_from_sys_call: #ifdef CONFIG_AUDITSYSCALL .macro auditsys_entry_common - movl %esi,%r9d /* 6th arg: 4th syscall arg */ - movl %edx,%r8d /* 5th arg: 3rd syscall arg */ - /* (already in %ecx) 4th arg: 2nd syscall arg */ - movl %ebx,%edx /* 3rd arg: 1st syscall arg */ - movl %eax,%esi /* 2nd arg: syscall number */ - movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */ + movl %esi,%r8d /* 5th arg: 4th syscall arg */ + movl %ecx,%r9d /*swap with edx*/ + movl %edx,%ecx /* 4th arg: 3rd syscall arg */ + movl %r9d,%edx /* 3rd arg: 2nd syscall arg */ + movl %ebx,%esi /* 2nd arg: 1st syscall arg */ + movl %eax,%edi /* 1st arg: syscall number */ call __audit_syscall_entry movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ cmpq $(IA32_NR_syscalls-1),%rax diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 0d0c9d4ab6d5..f9e3fabc8716 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -449,12 +449,11 @@ sysenter_audit: jnz syscall_trace_entry addl $4,%esp CFI_ADJUST_CFA_OFFSET -4 - /* %esi already in 8(%esp) 6th arg: 4th syscall arg */ - /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */ - /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */ - movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ - movl %eax,%edx /* 2nd arg: syscall number */ - movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ + movl %esi,4(%esp) /* 5th arg: 4th syscall arg */ + movl %edx,(%esp) /* 4th arg: 3rd syscall arg */ + /* %ecx already in %ecx 3rd arg: 2nd syscall arg */ + movl %ebx,%edx /* 2nd arg: 1st syscall arg */ + /* %eax already in %eax 1st arg: syscall number */ call __audit_syscall_entry pushl_cfi %ebx movl PT_EAX(%esp),%eax /* reload syscall number */ diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c844f0816ab8..5e8cb2ad9fb3 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -488,12 +488,11 @@ badsys: * jump back to the normal fast path. 
*/ auditsys: - movq %r10,%r9 /* 6th arg: 4th syscall arg */ - movq %rdx,%r8 /* 5th arg: 3rd syscall arg */ - movq %rsi,%rcx /* 4th arg: 2nd syscall arg */ - movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ - movq %rax,%rsi /* 2nd arg: syscall number */ - movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ + movq %r10,%r8 /* 5th arg: 4th syscall arg */ + movq %rdx,%rcx /* 4th arg: 3rd syscall arg */ + movq %rsi,%rdx /* 3rd arg: 2nd syscall arg */ + movq %rdi,%rsi /* 2nd arg: 1st syscall arg */ + movq %rax,%rdi /* 1st arg: syscall number */ call __audit_syscall_entry LOAD_ARGS 0 /* reload call-clobbered registers */ jmp system_call_fastpath diff --git a/include/linux/audit.h b/include/linux/audit.h index 783157b289e8..1ae00891aff9 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -115,8 +115,7 @@ extern void audit_log_session_info(struct audit_buffer *ab); /* Public API */ extern int audit_alloc(struct task_struct *task); extern void __audit_free(struct task_struct *task); -extern void __audit_syscall_entry(int arch, - int major, unsigned long a0, unsigned long a1, +extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3); extern void __audit_syscall_exit(int ret_success, long ret_value); extern struct filename *__audit_reusename(const __user char *uptr); @@ -148,7 +147,7 @@ static inline void audit_syscall_entry(int major, unsigned long a0, unsigned long a3) { if (unlikely(current->audit_context)) - __audit_syscall_entry(syscall_get_arch(), major, a0, a1, a2, a3); + __audit_syscall_entry(major, a0, a1, a2, a3); } static inline void audit_syscall_exit(void *pt_regs) { diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9f03ac205e1f..4e17443fd1ef 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1506,7 +1506,6 @@ void __audit_free(struct task_struct *tsk) /** * audit_syscall_entry - fill in an audit record at syscall entry - * @arch: architecture type * @major: major syscall type (function) * @a1: additional syscall register 1 * @a2: additional syscall register 2 @@ -1521,9 +1520,8 @@ void __audit_free(struct task_struct *tsk) * will only be written if another part of the kernel requests that it * be written). */ -void __audit_syscall_entry(int arch, int major, - unsigned long a1, unsigned long a2, - unsigned long a3, unsigned long a4) +void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, + unsigned long a3, unsigned long a4) { struct task_struct *tsk = current; struct audit_context *context = tsk->audit_context; -- cgit v1.2.3 From c0a8d9b0692cced5b0701ed501012e28b224d32b Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Mon, 26 May 2014 10:59:28 -0400 Subject: audit: reduce scope of audit_net_id audit_net_id isn't used outside kernel/audit.c. Reduce its scope. Signed-off-by: Richard Guy Briggs --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 3ef2e0e797e8..9a951e67a89e 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -126,7 +126,7 @@ static atomic_t audit_lost = ATOMIC_INIT(0); /* The netlink socket. 
*/ static struct sock *audit_sock; -int audit_net_id; +static int audit_net_id; /* Hash for inode-based rules */ struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; -- cgit v1.2.3 From 691e6d59d2b6cdb4595e5f626503a1c9e98b8baf Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Mon, 26 May 2014 11:02:48 -0400 Subject: audit: reduce scope of audit_log_fcaps audit_log_fcaps() isn't used outside kernel/audit.c. Reduce its scope. Signed-off-by: Richard Guy Briggs --- kernel/audit.c | 2 +- kernel/audit.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 9a951e67a89e..de991950091f 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1681,7 +1681,7 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) } } -void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) +static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) { kernel_cap_t *perm = &name->fcap.permitted; kernel_cap_t *inh = &name->fcap.inheritable; diff --git a/kernel/audit.h b/kernel/audit.h index 7bb65730c890..3cdffad5a1d9 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -222,7 +222,6 @@ extern void audit_copy_inode(struct audit_names *name, const struct inode *inode); extern void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap); -extern void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name); extern void audit_log_name(struct audit_context *context, struct audit_names *n, struct path *path, int record_num, int *call_panic); -- cgit v1.2.3 From 6eed9b261334932c742458edd64b7b9fd0b981a9 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 3 Jun 2014 22:05:10 +0200 Subject: kernel/audit.c: use ARRAY_SIZE instead of sizeof/sizeof[0] Use kernel.h definition. Cc: Eric Paris Cc: Andrew Morton Signed-off-by: Fabian Frederick Signed-off-by: Richard Guy Briggs --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index de991950091f..8a82d481393d 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -750,7 +750,7 @@ static int audit_set_feature(struct sk_buff *skb) struct audit_features *uaf; int i; - BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > sizeof(audit_feature_names)/sizeof(audit_feature_names[0])); + BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names)); uaf = nlmsg_data(nlmsg_hdr(skb)); /* if there is ever a version 2 we should handle that here */ -- cgit v1.2.3 From 01478d7d60f654419ba863856cad0446bcb73a59 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Fri, 13 Jun 2014 18:22:00 -0400 Subject: audit: use atomic_t to simplify audit_serial() Since there is already a primitive to do this operation in the atomic_t, use it to simplify audit_serial(). 
Signed-off-by: Richard Guy Briggs --- kernel/audit.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 8a82d481393d..7aef7cbd7bcf 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1301,19 +1301,9 @@ err: */ unsigned int audit_serial(void) { - static DEFINE_SPINLOCK(serial_lock); - static unsigned int serial = 0; + static atomic_t serial = ATOMIC_INIT(0); - unsigned long flags; - unsigned int ret; - - spin_lock_irqsave(&serial_lock, flags); - do { - ret = ++serial; - } while (unlikely(!ret)); - spin_unlock_irqrestore(&serial_lock, flags); - - return ret; + return atomic_add_return(1, &serial); } static inline void audit_get_stamp(struct audit_context *ctx, -- cgit v1.2.3 From e7df61f4d1ddb7fdd654dde6cd40f7cc398c3932 Mon Sep 17 00:00:00 2001 From: Burn Alting Date: Fri, 4 Apr 2014 16:00:38 +1100 Subject: audit: invalid op= values for rules Various audit events dealing with adding, removing and updating rules result in invalid values set for the op keys which result in embedded spaces in op= values. The invalid values are op="add rule" set in kernel/auditfilter.c op="remove rule" set in kernel/auditfilter.c op="remove rule" set in kernel/audit_tree.c op="updated rules" set in kernel/audit_watch.c op="remove rule" set in kernel/audit_watch.c Replace the space in the above values with an underscore character ('_'). Coded-by: Burn Alting Signed-off-by: Richard Guy Briggs --- kernel/audit_tree.c | 2 +- kernel/audit_watch.c | 4 ++-- kernel/auditfilter.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 135944a7b28a..bd418c486e9a 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -457,7 +457,7 @@ static void audit_log_remove_rule(struct audit_krule *rule) if (unlikely(!ab)) return; audit_log_format(ab, "op="); - audit_log_string(ab, "remove rule"); + audit_log_string(ab, "remove_rule"); audit_log_format(ab, " dir="); audit_log_untrustedstring(ab, rule->tree->pathname); audit_log_key(ab, rule->filterkey); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 70b4554d2fbe..ad9c1682f616 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -314,7 +314,7 @@ static void audit_update_watch(struct audit_parent *parent, &nentry->rule.list); } - audit_watch_log_rule_change(r, owatch, "updated rules"); + audit_watch_log_rule_change(r, owatch, "updated_rules"); call_rcu(&oentry->rcu, audit_free_rule_rcu); } @@ -342,7 +342,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent) list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { list_for_each_entry_safe(r, nextr, &w->rules, rlist) { e = container_of(r, struct audit_entry, rule); - audit_watch_log_rule_change(r, w, "remove rule"); + audit_watch_log_rule_change(r, w, "remove_rule"); list_del(&r->rlist); list_del(&r->list); list_del_rcu(&e->list); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 8e9bc9c3dbb7..b65a138250b8 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1060,7 +1060,7 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, return PTR_ERR(entry); err = audit_add_rule(entry); - audit_log_rule_change("add rule", &entry->rule, !err); + audit_log_rule_change("add_rule", &entry->rule, !err); if (err) audit_free_rule(entry); break; @@ -1070,7 +1070,7 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, return PTR_ERR(entry); err = audit_del_rule(entry); 
- audit_log_rule_change("remove rule", &entry->rule, !err); + audit_log_rule_change("remove_rule", &entry->rule, !err); audit_free_rule(entry); break; default: -- cgit v1.2.3 From 219ca39427bf6c46c4e1473493e33bc00635e99b Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 26 Mar 2014 07:26:47 -0400 Subject: audit: use union for audit_field values since they are mutually exclusive Since only one of val, uid, gid and lsm* are used at any given time, combine them to reduce the size of the struct audit_field. Signed-off-by: Richard Guy Briggs --- include/linux/audit.h | 14 +++++++++----- kernel/auditfilter.c | 29 ++++++++++++++++++++--------- 2 files changed, 29 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 1ae00891aff9..36dffeccebdb 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -66,12 +66,16 @@ struct audit_krule { struct audit_field { u32 type; - u32 val; - kuid_t uid; - kgid_t gid; + union { + u32 val; + kuid_t uid; + kgid_t gid; + struct { + char *lsm_str; + void *lsm_rule; + }; + }; u32 op; - char *lsm_str; - void *lsm_rule; }; extern int is_audit_feature_set(int which); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index b65a138250b8..40ed9813d4b2 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -71,6 +71,24 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = { DEFINE_MUTEX(audit_filter_mutex); +static void audit_free_lsm_field(struct audit_field *f) +{ + switch (f->type) { + case AUDIT_SUBJ_USER: + case AUDIT_SUBJ_ROLE: + case AUDIT_SUBJ_TYPE: + case AUDIT_SUBJ_SEN: + case AUDIT_SUBJ_CLR: + case AUDIT_OBJ_USER: + case AUDIT_OBJ_ROLE: + case AUDIT_OBJ_TYPE: + case AUDIT_OBJ_LEV_LOW: + case AUDIT_OBJ_LEV_HIGH: + kfree(f->lsm_str); + security_audit_rule_free(f->lsm_rule); + } +} + static inline void audit_free_rule(struct audit_entry *e) { int i; @@ -80,11 +98,8 @@ static inline void audit_free_rule(struct audit_entry *e) if (erule->watch) audit_put_watch(erule->watch); if (erule->fields) - for (i = 0; i < erule->field_count; i++) { - struct audit_field *f = &erule->fields[i]; - kfree(f->lsm_str); - security_audit_rule_free(f->lsm_rule); - } + for (i = 0; i < erule->field_count; i++) + audit_free_lsm_field(&erule->fields[i]); kfree(erule->fields); kfree(erule->filterkey); kfree(e); @@ -422,10 +437,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->type = data->fields[i]; f->val = data->values[i]; - f->uid = INVALID_UID; - f->gid = INVALID_GID; - f->lsm_str = NULL; - f->lsm_rule = NULL; /* Support legacy tests for a valid loginuid */ if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) { -- cgit v1.2.3 From 54e05eddbe507d54f1df18c2680d4f614af9e133 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 21 Aug 2014 13:40:41 -0400 Subject: audit: set nlmsg_len for multicast messages. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Report: Looking at your example code in http://people.redhat.com/rbriggs/audit-multicast-listen/audit-multicast-listen.c, it seems that nlmsg_len field in the received messages is supposed to contain the length of the header + payload, but it is always set to the size of the header only, i.e. 16. The example program works, because the printf format specifies the minimum width, not "precision", so it simply prints out the payload until the first zero byte. This isn't too much of a problem, but precludes the use of recvmmsg, iiuc? 
(gdb) p *(struct nlmsghdr*)nlh $14 = {nlmsg_len = 16, nlmsg_type = 1100, nlmsg_flags = 0, nlmsg_seq = 0, nlmsg_pid = 9910} The only time nlmsg_len would have been updated was at audit_buffer_alloc() inside audit_log_start() and never updated after. It should arguably be done in audit_log_vformat(), but would be more efficient in audit_log_end(). Reported-by: Zbigniew Jędrzejewski-Szmek Signed-off-by: Richard Guy Briggs --- kernel/audit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 7aef7cbd7bcf..d20f00ff7bb5 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1949,6 +1949,7 @@ void audit_log_end(struct audit_buffer *ab) } else { struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); + nlh->nlmsg_len = ab->skb->len; kauditd_send_multicast_skb(ab->skb); /* @@ -1960,7 +1961,7 @@ void audit_log_end(struct audit_buffer *ab) * protocol between the kaudit kernel subsystem and the auditd * userspace code. */ - nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN; + nlh->nlmsg_len -= NLMSG_HDRLEN; if (audit_pid) { skb_queue_tail(&audit_skb_queue, ab->skb); -- cgit v1.2.3 From 9ef91514774a140e468f99d73d7593521e6d25dc Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Sun, 24 Aug 2014 20:37:52 -0400 Subject: audit: correct AUDIT_GET_FEATURE return message type When an AUDIT_GET_FEATURE message is sent from userspace to the kernel, it should reply with a message tagged as an AUDIT_GET_FEATURE type with a struct audit_feature. The current reply is a message tagged as an AUDIT_GET type with a struct audit_feature. This appears to have been a cut-and-paste-eo in commit b0fed40. Reported-by: Steve Grubb Signed-off-by: Richard Guy Briggs --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index d20f00ff7bb5..3a80abb6eaa1 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -724,7 +724,7 @@ static int audit_get_feature(struct sk_buff *skb) seq = nlmsg_hdr(skb)->nlmsg_seq; - audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &af, sizeof(af)); + audit_send_reply(skb, seq, AUDIT_GET_FEATURE, 0, 0, &af, sizeof(af)); return 0; } -- cgit v1.2.3 From f874738e8c178b19479f7b143211a1df00367988 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Mon, 15 Sep 2014 16:17:37 -0400 Subject: audit: remove open_arg() function that is never used open_arg() was added in commit 55669bfa "audit: AUDIT_PERM support" and never used. Remove it. 
Signed-off-by: Richard Guy Briggs --- kernel/auditsc.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4e17443fd1ef..63a74a703c97 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -126,14 +126,6 @@ struct audit_tree_refs { struct audit_chunk *c[31]; }; -static inline int open_arg(int flags, int mask) -{ - int n = ACC_MODE(flags); - if (flags & (O_TRUNC | O_CREAT)) - n |= AUDIT_PERM_WRITE; - return n & mask; -} - static int audit_match_perm(struct audit_context *ctx, int mask) { unsigned n; -- cgit v1.2.3 From 9eab339b197a6903043d272295dcb716ff739b21 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Sat, 15 Mar 2014 18:42:34 -0400 Subject: audit: get comm using lock to avoid race in string printing When task->comm is passed directly to audit_log_untrustedstring() without getting a copy or using the task_lock, there is a race that could happen that would output a NULL (\0) in the output string that would effectively truncate the rest of the report text after the comm= field in the audit, losing fields. Use get_task_comm() to get a copy while acquiring the task_lock to prevent this and to prevent the result from being a mixture of old and new values of comm. Signed-off-by: Tetsuo Handa Signed-off-by: Richard Guy Briggs --- kernel/audit.c | 5 ++--- kernel/auditsc.c | 3 ++- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 3a80abb6eaa1..53bb39bf79e2 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1850,7 +1850,7 @@ EXPORT_SYMBOL(audit_log_task_context); void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) { const struct cred *cred; - char name[sizeof(tsk->comm)]; + char comm[sizeof(tsk->comm)]; struct mm_struct *mm = tsk->mm; char *tty; @@ -1884,9 +1884,8 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) from_kgid(&init_user_ns, cred->fsgid), tty, audit_get_sessionid(tsk)); - get_task_comm(name, tsk); audit_log_format(ab, " comm="); - audit_log_untrustedstring(ab, name); + audit_log_untrustedstring(ab, get_task_comm(comm, tsk)); if (mm) { down_read(&mm->mmap_sem); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 63a74a703c97..89335723fb2a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2424,6 +2424,7 @@ static void audit_log_task(struct audit_buffer *ab) kgid_t gid; unsigned int sessionid; struct mm_struct *mm = current->mm; + char comm[sizeof(current->comm)]; auid = audit_get_loginuid(current); sessionid = audit_get_sessionid(current); @@ -2436,7 +2437,7 @@ static void audit_log_task(struct audit_buffer *ab) sessionid); audit_log_task_context(ab); audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); - audit_log_untrustedstring(ab, current->comm); + audit_log_untrustedstring(ab, get_task_comm(comm, current)); if (mm) { down_read(&mm->mmap_sem); if (mm->exe_file) -- cgit v1.2.3 From 7990da71ebfa887ae6fe4464ab0d99ddeb8efacc Mon Sep 17 00:00:00 2001 From: Tomeu Vizoso Date: Wed, 3 Sep 2014 17:49:32 +0200 Subject: PM / QoS: Add PM_QOS_MEMORY_BANDWIDTH class Also adds a class type PM_QOS_SUM that aggregates the values by summing them. It can be used by memory controllers to calculate the optimum clock frequency based on the bandwidth needs of the different memory clients. Signed-off-by: Tomeu Vizoso Acked-by: Pavel Machek Signed-off-by: Rafael J. 
Wysocki --- Documentation/power/pm_qos_interface.txt | 4 +++- include/linux/pm_qos.h | 5 ++++- kernel/power/qos.c | 27 ++++++++++++++++++++++++++- 3 files changed, 33 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt index a5da5c7e7128..129f7c0e1483 100644 --- a/Documentation/power/pm_qos_interface.txt +++ b/Documentation/power/pm_qos_interface.txt @@ -5,7 +5,8 @@ performance expectations by drivers, subsystems and user space applications on one of the parameters. Two different PM QoS frameworks are available: -1. PM QoS classes for cpu_dma_latency, network_latency, network_throughput. +1. PM QoS classes for cpu_dma_latency, network_latency, network_throughput, +memory_bandwidth. 2. the per-device PM QoS framework provides the API to manage the per-device latency constraints and PM QoS flags. @@ -13,6 +14,7 @@ Each parameters have defined units: * latency: usec * timeout: usec * throughput: kbs (kilo bit / sec) + * memory bandwidth: mbs (mega bit / sec) 1. PM QoS framework diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h index 9ab4bf7c4646..636e82834506 100644 --- a/include/linux/pm_qos.h +++ b/include/linux/pm_qos.h @@ -15,6 +15,7 @@ enum { PM_QOS_CPU_DMA_LATENCY, PM_QOS_NETWORK_LATENCY, PM_QOS_NETWORK_THROUGHPUT, + PM_QOS_MEMORY_BANDWIDTH, /* insert new class ID */ PM_QOS_NUM_CLASSES, @@ -32,6 +33,7 @@ enum pm_qos_flags_status { #define PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE (2000 * USEC_PER_SEC) #define PM_QOS_NETWORK_LAT_DEFAULT_VALUE (2000 * USEC_PER_SEC) #define PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE 0 +#define PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE 0 #define PM_QOS_RESUME_LATENCY_DEFAULT_VALUE 0 #define PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE 0 #define PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT (-1) @@ -69,7 +71,8 @@ struct dev_pm_qos_request { enum pm_qos_type { PM_QOS_UNITIALIZED, PM_QOS_MAX, /* return the largest value */ - PM_QOS_MIN /* return the smallest value */ + PM_QOS_MIN, /* return the smallest value */ + PM_QOS_SUM /* return the sum */ }; /* diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 884b77058864..5f4c006c4b1e 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -105,11 +105,27 @@ static struct pm_qos_object network_throughput_pm_qos = { }; +static BLOCKING_NOTIFIER_HEAD(memory_bandwidth_notifier); +static struct pm_qos_constraints memory_bw_constraints = { + .list = PLIST_HEAD_INIT(memory_bw_constraints.list), + .target_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, + .default_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, + .type = PM_QOS_SUM, + .notifiers = &memory_bandwidth_notifier, +}; +static struct pm_qos_object memory_bandwidth_pm_qos = { + .constraints = &memory_bw_constraints, + .name = "memory_bandwidth", +}; + + static struct pm_qos_object *pm_qos_array[] = { &null_pm_qos, &cpu_dma_pm_qos, &network_lat_pm_qos, - &network_throughput_pm_qos + &network_throughput_pm_qos, + &memory_bandwidth_pm_qos, }; static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, @@ -130,6 +146,9 @@ static const struct file_operations pm_qos_power_fops = { /* unlocked internal variant */ static inline int pm_qos_get_value(struct pm_qos_constraints *c) { + struct plist_node *node; + int total_value = 0; + if (plist_head_empty(&c->list)) return c->no_constraint_value; @@ -140,6 +159,12 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c) case PM_QOS_MAX: return 
plist_last(&c->list)->prio; + case PM_QOS_SUM: + plist_for_each(node, &c->list) + total_value += node->prio; + + return total_value; + default: /* runtime check for not using enum */ BUG(); -- cgit v1.2.3 From 3639f17068ed40e4e208a6e218481d49817bbd56 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 2 Oct 2014 22:05:18 -0400 Subject: audit: put rule existence check in canonical order Use same rule existence check order as audit_make_tree(), audit_to_watch(), update_lsm_rule() for legibility. Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- kernel/auditfilter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 40ed9813d4b2..4a11697cf5b8 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -163,7 +163,7 @@ static inline int audit_to_inode(struct audit_krule *krule, struct audit_field *f) { if (krule->listnr != AUDIT_FILTER_EXIT || - krule->watch || krule->inode_f || krule->tree || + krule->inode_f || krule->watch || krule->tree || (f->op != Audit_equal && f->op != Audit_not_equal)) return -EINVAL; -- cgit v1.2.3 From 739c95038e68d364b01c0fc6f8fb8e47b1c1e979 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 10 Oct 2014 15:05:21 -0400 Subject: audit: WARN if audit_rule_change called illegally Signed-off-by: Eric Paris --- kernel/auditfilter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 4a11697cf5b8..4419d1fbcad1 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1085,7 +1085,8 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, audit_free_rule(entry); break; default: - return -EINVAL; + err = -EINVAL; + WARN_ON(1); } return err; -- cgit v1.2.3 From e85322d21cfebeac64f58a204e9adc0bc5c1e46f Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 2 Oct 2014 22:05:19 -0400 Subject: audit: cull redundancy in audit_rule_change Re-factor audit_rule_change() to reduce the amount of code redundancy and simplify the logic. 
Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- kernel/auditfilter.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 4419d1fbcad1..d214cd073a58 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1064,31 +1064,27 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, int err = 0; struct audit_entry *entry; + entry = audit_data_to_entry(data, datasz); + if (IS_ERR(entry)) + return PTR_ERR(entry); + switch (type) { case AUDIT_ADD_RULE: - entry = audit_data_to_entry(data, datasz); - if (IS_ERR(entry)) - return PTR_ERR(entry); - err = audit_add_rule(entry); audit_log_rule_change("add_rule", &entry->rule, !err); - if (err) - audit_free_rule(entry); break; case AUDIT_DEL_RULE: - entry = audit_data_to_entry(data, datasz); - if (IS_ERR(entry)) - return PTR_ERR(entry); - err = audit_del_rule(entry); audit_log_rule_change("remove_rule", &entry->rule, !err); - audit_free_rule(entry); break; default: err = -EINVAL; WARN_ON(1); } + if (err || type == AUDIT_DEL_RULE) + audit_free_rule(entry); + return err; } -- cgit v1.2.3 From 2991dd2b0117e864f394c826af6df144206ce0db Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 2 Oct 2014 22:05:24 -0400 Subject: audit: rename audit_log_remove_rule to disambiguate for trees Rename audit_log_remove_rule() to audit_tree_log_remove_rule() to avoid confusion with watch and mark rule removal/changes. Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- kernel/audit_tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index bd418c486e9a..e242e3a9864a 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -449,7 +449,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) return 0; } -static void audit_log_remove_rule(struct audit_krule *rule) +static void audit_tree_log_remove_rule(struct audit_krule *rule) { struct audit_buffer *ab; @@ -476,7 +476,7 @@ static void kill_rules(struct audit_tree *tree) list_del_init(&rule->rlist); if (rule->tree) { /* not a half-baked one */ - audit_log_remove_rule(rule); + audit_tree_log_remove_rule(rule); rule->tree = NULL; list_del_rcu(&entry->list); list_del(&entry->rule.list); -- cgit v1.2.3 From 51fae6da640edf9d266c94f36bc806c63c301991 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 21 Oct 2014 09:27:12 +0200 Subject: freezer: Do not freeze tasks killed by OOM killer Since f660daac474c6f (oom: thaw threads if oom killed thread is frozen before deferring) OOM killer relies on being able to thaw a frozen task to handle OOM situation but a3201227f803 (freezer: make freezing() test freeze conditions in effect instead of TIF_FREEZE) has reorganized the code and stopped clearing freeze flag in __thaw_task. This means that the target task only wakes up and goes into the fridge again because the freezing condition hasn't changed for it. This reintroduces the bug fixed by f660daac474c6f. Fix the issue by checking for TIF_MEMDIE thread flag in freezing_slow_path and exclude the task from freezing completely. If a task was already frozen it would get woken by __thaw_task from OOM killer and get out of freezer after rechecking freezing(). 
Changes since v1
- put TIF_MEMDIE check into freezing_slowpath rather than in __refrigerator
  as per Oleg
- return __thaw_task into oom_scan_process_thread because oom_kill_process
  will not wake task in the fridge because it is sleeping uninterruptible

[mhocko@suse.cz: rewrote the changelog]
Fixes: a3201227f803 (freezer: make freezing() test freeze conditions in effect instead of TIF_FREEZE)
Cc: 3.3+ # 3.3+
Signed-off-by: Cong Wang
Signed-off-by: Michal Hocko
Acked-by: Oleg Nesterov
Signed-off-by: Rafael J. Wysocki
---
 kernel/freezer.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/freezer.c b/kernel/freezer.c
index aa6a8aadb911..8f9279b9c6d7 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -42,6 +42,9 @@ bool freezing_slow_path(struct task_struct *p)
 	if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
 		return false;
 
+	if (test_thread_flag(TIF_MEMDIE))
+		return false;
+
 	if (pm_nosig_freezing || cgroup_freezing(p))
 		return true;
 
-- cgit v1.2.3

From c05eb32f472fb9f7f474c20ff6fa5bfe0cbedc05 Mon Sep 17 00:00:00 2001
From: Cong Wang
Date: Tue, 21 Oct 2014 09:27:13 +0200
Subject: freezer: remove obsolete comments in __thaw_task()

__thaw_task() no longer clears frozen flag since commit a3201227f803
(freezer: make freezing() test freeze conditions in effect instead of TIF_FREEZE).

Reviewed-by: Michal Hocko
Signed-off-by: Cong Wang
Signed-off-by: Rafael J. Wysocki
---
 kernel/freezer.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/freezer.c b/kernel/freezer.c
index 8f9279b9c6d7..a8900a3bc27a 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -150,12 +150,6 @@ void __thaw_task(struct task_struct *p)
 {
 	unsigned long flags;
 
-	/*
-	 * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to
-	 * be visible to @p as waking up implies wmb. Waking up inside
-	 * freezer_lock also prevents wakeups from leaking outside
-	 * refrigerator.
-	 */
 	spin_lock_irqsave(&freezer_lock, flags);
 	if (frozen(p))
 		wake_up_process(p);
-- cgit v1.2.3

From 5695be142e203167e3cb515ef86a88424f3524eb Mon Sep 17 00:00:00 2001
From: Michal Hocko
Date: Mon, 20 Oct 2014 18:12:32 +0200
Subject: OOM, PM: OOM killed task shouldn't escape PM suspend

PM freezer relies on having all tasks frozen by the time devices are
getting frozen so that no task will touch them while they are getting
frozen. But OOM killer is allowed to kill an already frozen task in
order to handle OOM situation. In order to protect from late wake ups
OOM killer is disabled after all tasks are frozen. This, however, still
keeps a window open when a killed task didn't manage to die by the time
freeze_processes finishes.

Reduce the race window by checking all tasks after OOM killer has been
disabled. This is still not race free completely unfortunately because
oom_killer_disable cannot stop an already ongoing OOM killer so a task
might still wake up from the fridge and get killed without
freeze_processes noticing. Full synchronization of OOM and freezer is,
however, too heavy weight for this highly unlikely case.

Introduce and check oom_kills counter which gets incremented early when
the allocator enters __alloc_pages_may_oom path and only check all the
tasks if the counter changes during the freezing attempt. The counter
is updated so early to reduce the race window since allocator checked
oom_killer_disabled which is set by PM-freezing code. A false positive
will push the PM-freezer into a slow path but that is not a big deal.
Changes since v1 - push the re-check loop out of freeze_processes into check_frozen_processes and invert the condition to make the code more readable as per Rafael Fixes: f660daac474c6f (oom: thaw threads if oom killed thread is frozen before deferring) Cc: 3.2+ # 3.2+ Signed-off-by: Michal Hocko Signed-off-by: Rafael J. Wysocki --- include/linux/oom.h | 3 +++ kernel/power/process.c | 40 +++++++++++++++++++++++++++++++++++++++- mm/oom_kill.c | 17 +++++++++++++++++ mm/page_alloc.c | 8 ++++++++ 4 files changed, 67 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/oom.h b/include/linux/oom.h index 647395a1a550..e8d6e1058723 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -50,6 +50,9 @@ static inline bool oom_task_origin(const struct task_struct *p) extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); + +extern int oom_kills_count(void); +extern void note_oom_kill(void); extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, unsigned int points, unsigned long totalpages, struct mem_cgroup *memcg, nodemask_t *nodemask, diff --git a/kernel/power/process.c b/kernel/power/process.c index 7b323221b9ee..5cc588c1abab 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -108,6 +108,28 @@ static int try_to_freeze_tasks(bool user_only) return todo ? -EBUSY : 0; } +/* + * Returns true if all freezable tasks (except for current) are frozen already + */ +static bool check_frozen_processes(void) +{ + struct task_struct *g, *p; + bool ret = true; + + read_lock(&tasklist_lock); + for_each_process_thread(g, p) { + if (p != current && !freezer_should_skip(p) && + !frozen(p)) { + ret = false; + goto done; + } + } +done: + read_unlock(&tasklist_lock); + + return ret; +} + /** * freeze_processes - Signal user space processes to enter the refrigerator. * The current thread will not be frozen. The same process that calls @@ -118,6 +140,7 @@ static int try_to_freeze_tasks(bool user_only) int freeze_processes(void) { int error; + int oom_kills_saved; error = __usermodehelper_disable(UMH_FREEZING); if (error) @@ -132,12 +155,27 @@ int freeze_processes(void) pm_wakeup_clear(); printk("Freezing user space processes ... "); pm_freezing = true; + oom_kills_saved = oom_kills_count(); error = try_to_freeze_tasks(true); if (!error) { - printk("done."); __usermodehelper_set_disable_depth(UMH_DISABLED); oom_killer_disable(); + + /* + * There might have been an OOM kill while we were + * freezing tasks and the killed task might be still + * on the way out so we have to double check for race. + */ + if (oom_kills_count() != oom_kills_saved && + !check_frozen_processes()) { + __usermodehelper_set_disable_depth(UMH_ENABLED); + printk("OOM in progress."); + error = -EBUSY; + goto done; + } + printk("done."); } +done: printk("\n"); BUG_ON(in_atomic()); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index bbf405a3a18f..5340f6b91312 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -404,6 +404,23 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, dump_tasks(memcg, nodemask); } +/* + * Number of OOM killer invocations (including memcg OOM killer). + * Primarily used by PM freezer to check for potential races with + * OOM killed frozen task. 
+ */ +static atomic_t oom_kills = ATOMIC_INIT(0); + +int oom_kills_count(void) +{ + return atomic_read(&oom_kills); +} + +void note_oom_kill(void) +{ + atomic_inc(&oom_kills); +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /* * Must be called while holding a reference to p, which will be released upon diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 736d8e1b6381..9cd36b822444 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2251,6 +2251,14 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, return NULL; } + /* + * PM-freezer should be notified that there might be an OOM killer on + * its way to kill and wake somebody up. This is too early and we might + * end up not killing anything but false positives are acceptable. + * See freeze_processes. + */ + note_oom_kill(); + /* * Go through the zonelist yet one more time, keep very high watermark * here, this is only to catch a parallel oom killing, we must fail if -- cgit v1.2.3 From a28e785a9f794ba32e603570ab52a262cf963489 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 21 Oct 2014 09:27:15 +0200 Subject: PM: convert do_each_thread to for_each_process_thread as per 0c740d0afc3b (introduce for_each_thread() to replace the buggy while_each_thread()) get rid of do_each_thread { } while_each_thread() construct and replace it by a more error prone for_each_thread. This patch doesn't introduce any user visible change. Suggested-by: Oleg Nesterov Signed-off-by: Michal Hocko Signed-off-by: Rafael J. Wysocki --- kernel/power/process.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 5cc588c1abab..7f0d4343af1b 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -46,13 +46,13 @@ static int try_to_freeze_tasks(bool user_only) while (true) { todo = 0; read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (p == current || !freeze_task(p)) continue; if (!freezer_should_skip(p)) todo++; - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); if (!user_only) { @@ -93,11 +93,11 @@ static int try_to_freeze_tasks(bool user_only) if (!wakeup) { read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (p != current && !freezer_should_skip(p) && freezing(p) && !frozen(p)) sched_show_task(p); - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); } } else { @@ -229,11 +229,11 @@ void thaw_processes(void) thaw_workqueues(); read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK)); __thaw_task(p); - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); WARN_ON(!(curr->flags & PF_SUSPEND_TASK)); @@ -256,10 +256,10 @@ void thaw_kernel_threads(void) thaw_workqueues(); read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) __thaw_task(p); - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); schedule(); -- cgit v1.2.3 From 71be2114a5474a76edad95343d89b8731457fccd Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 22 Oct 2014 22:47:32 +0200 Subject: PM / freezer: Clean up code after recent fixes Clean up the code in process.c after recent changes to get rid of unnecessary labels and goto statements. Signed-off-by: Rafael J. 
Wysocki --- kernel/power/process.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 7f0d4343af1b..5a6ec8678b9a 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -108,25 +108,27 @@ static int try_to_freeze_tasks(bool user_only) return todo ? -EBUSY : 0; } +static bool __check_frozen_processes(void) +{ + struct task_struct *g, *p; + + for_each_process_thread(g, p) + if (p != current && !freezer_should_skip(p) && !frozen(p)) + return false; + + return true; +} + /* * Returns true if all freezable tasks (except for current) are frozen already */ static bool check_frozen_processes(void) { - struct task_struct *g, *p; - bool ret = true; + bool ret; read_lock(&tasklist_lock); - for_each_process_thread(g, p) { - if (p != current && !freezer_should_skip(p) && - !frozen(p)) { - ret = false; - goto done; - } - } -done: + ret = __check_frozen_processes(); read_unlock(&tasklist_lock); - return ret; } @@ -167,15 +169,14 @@ int freeze_processes(void) * on the way out so we have to double check for race. */ if (oom_kills_count() != oom_kills_saved && - !check_frozen_processes()) { + !check_frozen_processes()) { __usermodehelper_set_disable_depth(UMH_ENABLED); printk("OOM in progress."); error = -EBUSY; - goto done; + } else { + printk("done."); } - printk("done."); } -done: printk("\n"); BUG_ON(in_atomic()); -- cgit v1.2.3 From b2c4623dcd07af4b8ae3b56ae5f879e281c7b4f8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 22 Oct 2014 10:00:05 -0700 Subject: rcu: More on deadlock between CPU hotplug and expedited grace periods Commit dd56af42bd82 (rcu: Eliminate deadlock between CPU hotplug and expedited grace periods) was incomplete. Although it did eliminate deadlocks involving synchronize_sched_expedited()'s acquisition of cpu_hotplug.lock via get_online_cpus(), it did nothing about the similar deadlock involving acquisition of this same lock via put_online_cpus(). This deadlock became apparent with testing involving hibernation. This commit therefore changes put_online_cpus() acquisition of this lock to be conditional, and increments a new cpu_hotplug.puts_pending field in case of acquisition failure. Then cpu_hotplug_begin() checks for this new field being non-zero, and applies any changes to cpu_hotplug.refcount. Reported-by: Jiri Kosina Signed-off-by: Paul E. McKenney Tested-by: Jiri Kosina Tested-by: Borislav Petkov --- kernel/cpu.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 356450f09c1f..90a3d017b90c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -64,6 +64,8 @@ static struct { * an ongoing cpu hotplug operation. */ int refcount; + /* And allows lockless put_online_cpus(). 
*/ + atomic_t puts_pending; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; @@ -113,7 +115,11 @@ void put_online_cpus(void) { if (cpu_hotplug.active_writer == current) return; - mutex_lock(&cpu_hotplug.lock); + if (!mutex_trylock(&cpu_hotplug.lock)) { + atomic_inc(&cpu_hotplug.puts_pending); + cpuhp_lock_release(); + return; + } if (WARN_ON(!cpu_hotplug.refcount)) cpu_hotplug.refcount++; /* try to fix things up */ @@ -155,6 +161,12 @@ void cpu_hotplug_begin(void) cpuhp_lock_acquire(); for (;;) { mutex_lock(&cpu_hotplug.lock); + if (atomic_read(&cpu_hotplug.puts_pending)) { + int delta; + + delta = atomic_xchg(&cpu_hotplug.puts_pending, 0); + cpu_hotplug.refcount -= delta; + } if (likely(!cpu_hotplug.refcount)) break; __set_current_state(TASK_UNINTERRUPTIBLE); -- cgit v1.2.3 From 8252ecf346474cfe46315bd0a7ca655c293c34a9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 24 Oct 2014 14:56:01 -0400 Subject: ftrace: Set ops->old_hash on modifying what an ops hooks to The code that checks for trampolines when modifying function hooks tests against a modified ops "old_hash". But the ops old_hash pointer is not being updated before the changes are made, making it possible to not find the right hash to the callback and possibly causing ftrace to break in accounting and disable itself. Have the ops set its old_hash before the modifying takes place. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fb186b9ddf51..483b8c1b1de0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2293,10 +2293,13 @@ static void ftrace_run_update_code(int command) FTRACE_WARN_ON(ret); } -static void ftrace_run_modify_code(struct ftrace_ops *ops, int command) +static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, + struct ftrace_hash *old_hash) { ops->flags |= FTRACE_OPS_FL_MODIFYING; + ops->old_hash.filter_hash = old_hash; ftrace_run_update_code(command); + ops->old_hash.filter_hash = NULL; ops->flags &= ~FTRACE_OPS_FL_MODIFYING; } @@ -3340,7 +3343,7 @@ static struct ftrace_ops trace_probe_ops __read_mostly = static int ftrace_probe_registered; -static void __enable_ftrace_function_probe(void) +static void __enable_ftrace_function_probe(struct ftrace_hash *old_hash) { int ret; int i; @@ -3348,7 +3351,8 @@ static void __enable_ftrace_function_probe(void) if (ftrace_probe_registered) { /* still need to update the function call sites */ if (ftrace_enabled) - ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS); + ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS, + old_hash); return; } @@ -3477,13 +3481,14 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, } while_for_each_ftrace_rec(); ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + + __enable_ftrace_function_probe(old_hash); + if (!ret) free_ftrace_hash_rcu(old_hash); else count = ret; - __enable_ftrace_function_probe(); - out_unlock: mutex_unlock(&ftrace_lock); out: @@ -3764,10 +3769,11 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) return add_hash_entry(hash, ip); } -static void ftrace_ops_update_code(struct ftrace_ops *ops) +static void ftrace_ops_update_code(struct ftrace_ops *ops, + struct ftrace_hash *old_hash) { if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) - ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS); + 
ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); } static int @@ -3813,7 +3819,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, old_hash = *orig_hash; ret = ftrace_hash_move(ops, enable, orig_hash, hash); if (!ret) { - ftrace_ops_update_code(ops); + ftrace_ops_update_code(ops, old_hash); free_ftrace_hash_rcu(old_hash); } mutex_unlock(&ftrace_lock); @@ -4058,7 +4064,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file) ret = ftrace_hash_move(iter->ops, filter_hash, orig_hash, iter->hash); if (!ret) { - ftrace_ops_update_code(iter->ops); + ftrace_ops_update_code(iter->ops, old_hash); free_ftrace_hash_rcu(old_hash); } mutex_unlock(&ftrace_lock); -- cgit v1.2.3 From 4fc409048d5afb1ad853f294b4262ecf2c980a49 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 24 Oct 2014 14:48:35 -0400 Subject: ftrace: Fix checking of trampoline ftrace_ops in finding trampoline When modifying code, ftrace has several checks to make sure things are being done correctly. One of them is to make sure any code it modifies is exactly what it expects it to be before it modifies it. In order to do so with the new trampoline logic, it must be able to find out what trampoline a function is hooked to in order to see if the code that hooks to it is what's expected. The logic to find the trampoline from a record (accounting descriptor for a function that is hooked) needs to only look at the "old_hash" of an ops that is being modified. The old_hash is the list of function an ops is hooked to before its update. Since a record would only be pointing to an ops that is being modified if it was already hooked before. Currently, it can pick a modified ops based on its new functions it will be hooked to, and this picks the wrong trampoline and causes the check to fail, disabling ftrace. Signed-off-by: Steven Rostedt ftrace: squash into ordering of ops for modification --- kernel/trace/ftrace.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 483b8c1b1de0..31c90fec4158 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1925,8 +1925,16 @@ ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) * when we are adding another op to the rec or removing the * current one. Thus, if the op is being added, we can * ignore it because it hasn't attached itself to the rec - * yet. That means we just need to find the op that has a - * trampoline and is not beeing added. + * yet. + * + * If an ops is being modified (hooking to different functions) + * then we don't care about the new functions that are being + * added, just the old ones (that are probably being removed). + * + * If we are adding an ops to a function that already is using + * a trampoline, it needs to be removed (trampolines are only + * for single ops connected), then an ops that is not being + * modified also needs to be checked. */ do_for_each_ftrace_op(op, ftrace_ops_list) { @@ -1940,17 +1948,23 @@ ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) if (op->flags & FTRACE_OPS_FL_ADDING) continue; + /* - * If the ops is not being added and has a trampoline, - * then it must be the one that we want! + * If the ops is being modified and is in the old + * hash, then it is probably being removed from this + * function. */ - if (hash_contains_ip(ip, op->func_hash)) - return op; - - /* If the ops is being modified, it may be in the old hash. 
*/ if ((op->flags & FTRACE_OPS_FL_MODIFYING) && hash_contains_ip(ip, &op->old_hash)) return op; + /* + * If the ops is not being added or modified, and it's + * in its normal filter hash, then this must be the one + * we want! + */ + if (!(op->flags & FTRACE_OPS_FL_MODIFYING) && + hash_contains_ip(ip, op->func_hash)) + return op; } while_for_each_ftrace_op(op); -- cgit v1.2.3 From 6891c4509c792209c44ced55a60f13954cb50ef4 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sat, 4 Oct 2014 23:06:39 +0200 Subject: posix-timers: Fix stack info leak in timer_create() If userland creates a timer without specifying a sigevent info, we'll create one ourself, using a stack local variable. Particularly will we use the timer ID as sival_int. But as sigev_value is a union containing a pointer and an int, that assignment will only partially initialize sigev_value on systems where the size of a pointer is bigger than the size of an int. On such systems we'll copy the uninitialized stack bytes from the timer_create() call to userland when the timer actually fires and we're going to deliver the signal. Initialize sigev_value with 0 to plug the stack info leak. Found in the PaX patch, written by the PaX Team. Fixes: 5a9fa7307285 ("posix-timers: kill ->it_sigev_signo and...") Signed-off-by: Mathias Krause Cc: Oleg Nesterov Cc: Brad Spengler Cc: PaX Team Cc: # v2.6.28+ Link: http://lkml.kernel.org/r/1412456799-32339-1-git-send-email-minipli@googlemail.com Signed-off-by: Thomas Gleixner --- kernel/time/posix-timers.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 42b463ad90f2..31ea01f42e1f 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -636,6 +636,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, goto out; } } else { + memset(&event.sigev_value, 0, sizeof(event.sigev_value)); event.sigev_notify = SIGEV_SIGNAL; event.sigev_signo = SIGALRM; event.sigev_value.sival_int = new_timer->it_id; -- cgit v1.2.3 From 10632008b9e18b76cbff0ffc69c15e948aa548e0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 20 Oct 2014 15:07:50 +0400 Subject: clockevents: Prevent shift out of bounds Andrey reported that on a kernel with UBSan enabled he found: UBSan: Undefined behaviour in ../kernel/time/clockevents.c:75:34 I guess it should be 1ULL here instead of 1U: (!ismax || evt->mult <= (1U << evt->shift))) That's indeed the correct solution because shift might be 32. Reported-by: Andrey Ryabinin Cc: Peter Zijlstra Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 9c94c19f1305..55449909f114 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -72,7 +72,7 @@ static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, * Also omit the add if it would overflow the u64 boundary. */ if ((~0ULL - clc > rnd) && - (!ismax || evt->mult <= (1U << evt->shift))) + (!ismax || evt->mult <= (1ULL << evt->shift))) clc += rnd; do_div(clc, evt->mult); -- cgit v1.2.3 From 993b2ff221999066fcff231590593d0b98f45d32 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 23 Oct 2014 20:27:00 -0700 Subject: futex: Mention key referencing differences between shared and private futexes Update our documentation as of fix 76835b0ebf8 (futex: Ensure get_futex_key_refs() always implies a barrier). 
Explicitly state that we don't do key referencing for private futexes. Signed-off-by: Davidlohr Bueso Cc: Matteo Franchin Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Darren Hart Cc: Peter Zijlstra Cc: Paul E. McKenney Acked-by: Catalin Marinas Link: http://lkml.kernel.org/r/1414121220.817.0.camel@linux-t7sj.site Signed-off-by: Thomas Gleixner --- kernel/futex.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index f3a3a071283c..bbf071f325b8 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -143,9 +143,8 @@ * * Where (A) orders the waiters increment and the futex value read through * atomic operations (see hb_waiters_inc) and where (B) orders the write - * to futex and the waiters read -- this is done by the barriers in - * get_futex_key_refs(), through either ihold or atomic_inc, depending on the - * futex type. + * to futex and the waiters read -- this is done by the barriers for both + * shared and private futexes in get_futex_key_refs(). * * This yields the following case (where X:=waiters, Y:=futex): * @@ -344,13 +343,20 @@ static void get_futex_key_refs(union futex_key *key) futex_get_mm(key); /* implies MB (B) */ break; default: + /* + * Private futexes do not hold reference on an inode or + * mm, therefore the only purpose of calling get_futex_key_refs + * is because we need the barrier for the lockless waiter check. + */ smp_mb(); /* explicit MB (B) */ } } /* * Drop a reference to the resource addressed by a key. - * The hash bucket spinlock must not be held. + * The hash bucket spinlock must not be held. This is + * a no-op for private futexes, see comment in the get + * counterpart. */ static void drop_futex_key_refs(union futex_key *key) { -- cgit v1.2.3 From 30a6b8031fe14031ab27c1fa3483cb9780e7f63c Mon Sep 17 00:00:00 2001 From: Brian Silverman Date: Sat, 25 Oct 2014 20:20:37 -0400 Subject: futex: Fix a race condition between REQUEUE_PI and task death free_pi_state and exit_pi_state_list both clean up futex_pi_state's. exit_pi_state_list takes the hb lock first, and most callers of free_pi_state do too. requeue_pi doesn't, which means free_pi_state can free the pi_state out from under exit_pi_state_list. For example: task A | task B exit_pi_state_list | pi_state = | curr->pi_state_list->next | | futex_requeue(requeue_pi=1) | // pi_state is the same as | // the one in task A | free_pi_state(pi_state) | list_del_init(&pi_state->list) | kfree(pi_state) list_del_init(&pi_state->list) | Move the free_pi_state calls in requeue_pi to before it drops the hb locks which it's already holding. [ tglx: Removed a pointless free_pi_state() call and the hb->lock held debugging. The latter comes via a seperate patch ] Signed-off-by: Brian Silverman Cc: austin.linux@gmail.com Cc: darren@dvhart.com Cc: peterz@infradead.org Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1414282837-23092-1-git-send-email-bsilver16384@gmail.com Signed-off-by: Thomas Gleixner --- kernel/futex.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index bbf071f325b8..63678b573d61 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -647,8 +647,14 @@ static struct futex_pi_state * alloc_pi_state(void) return pi_state; } +/* + * Must be called with the hb lock held. 
+ */ static void free_pi_state(struct futex_pi_state *pi_state) { + if (!pi_state) + return; + if (!atomic_dec_and_test(&pi_state->refcount)) return; @@ -1527,15 +1533,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, } retry: - if (pi_state != NULL) { - /* - * We will have to lookup the pi_state again, so free this one - * to keep the accounting correct. - */ - free_pi_state(pi_state); - pi_state = NULL; - } - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); if (unlikely(ret != 0)) goto out; @@ -1625,6 +1622,8 @@ retry_private: case 0: break; case -EFAULT: + free_pi_state(pi_state); + pi_state = NULL; double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); put_futex_key(&key2); @@ -1640,6 +1639,8 @@ retry_private: * exit to complete. * - The user space value changed. */ + free_pi_state(pi_state); + pi_state = NULL; double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); put_futex_key(&key2); @@ -1716,6 +1717,7 @@ retry_private: } out_unlock: + free_pi_state(pi_state); double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); @@ -1733,8 +1735,6 @@ out_put_keys: out_put_key1: put_futex_key(&key1); out: - if (pi_state != NULL) - free_pi_state(pi_state); return ret ? ret : task_count; } -- cgit v1.2.3 From 94fb823fcb4892614f57e59601bb9d4920f24711 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Fri, 24 Oct 2014 20:29:10 +0300 Subject: PM / Sleep: fix recovery during resuming from hibernation If a device's dev_pm_ops::freeze callback fails during the QUIESCE phase, we don't rollback things correctly calling the thaw and complete callbacks. This could leave some devices in a suspended state in case of an error during resuming from hibernation. Signed-off-by: Imre Deak Cc: All applicable Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index a9dfa79b6bab..1f35a3478f3c 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -502,8 +502,14 @@ int hibernation_restore(int platform_mode) error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { error = resume_target_kernel(platform_mode); - dpm_resume_end(PMSG_RECOVER); + /* + * The above should either succeed and jump to the new kernel, + * or return with an error. Otherwise things are just + * undefined, so let's be paranoid. + */ + BUG_ON(!error); } + dpm_resume_end(PMSG_RECOVER); pm_restore_gfp_mask(); resume_console(); pm_restore_console(); -- cgit v1.2.3 From f89b7755f517cdbb755d7543eef986ee9d54e654 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 23 Oct 2014 18:41:08 -0700 Subject: bpf: split eBPF out of NET introduce two configs: - hidden CONFIG_BPF to select eBPF interpreter that classic socket filters depend on - visible CONFIG_BPF_SYSCALL (default off) that tracing and sockets can use that solves several problems: - tracing and others that wish to use eBPF don't need to depend on NET. They can use BPF_SYSCALL to allow loading from userspace or select BPF to use it directly from kernel in NET-less configs. - in 3.18 programs cannot be attached to events yet, so don't force it on - when the rest of eBPF infra is there in 3.19+, it's still useful to switch it off to minimize kernel size bloat-o-meter on x64 shows: add/remove: 0/60 grow/shrink: 0/2 up/down: 0/-15601 (-15601) tested with many different config combinations. Hopefully didn't miss anything. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- init/Kconfig | 14 ++++++++++++++ kernel/Makefile | 2 +- kernel/bpf/Makefile | 6 +++--- kernel/bpf/core.c | 9 +++++++++ net/Kconfig | 2 +- 5 files changed, 28 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index 3ee28ae02cc8..2081a4d3d917 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1341,6 +1341,10 @@ config SYSCTL_ARCH_UNALIGN_ALLOW config HAVE_PCSPKR_PLATFORM bool +# interpreter that classic socket filters depend on +config BPF + bool + menuconfig EXPERT bool "Configure standard kernel features (expert users)" # Unhide debug options, to make the on-by-default options visible @@ -1521,6 +1525,16 @@ config EVENTFD If unsure, say Y. +# syscall, maps, verifier +config BPF_SYSCALL + bool "Enable bpf() system call" if EXPERT + select ANON_INODES + select BPF + default n + help + Enable the bpf() system call that allows to manipulate eBPF + programs and maps via file descriptors. + config SHMEM bool "Use full shmem filesystem" if EXPERT default y diff --git a/kernel/Makefile b/kernel/Makefile index dc5c77544fd6..17ea6d4a9a24 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -86,7 +86,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o -obj-$(CONFIG_NET) += bpf/ +obj-$(CONFIG_BPF) += bpf/ obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 45427239f375..0daf7f6ae7df 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,5 +1,5 @@ -obj-y := core.o syscall.o verifier.o - +obj-y := core.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o ifdef CONFIG_TEST_BPF -obj-y += test_stub.o +obj-$(CONFIG_BPF_SYSCALL) += test_stub.o endif diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f0c30c59b317..d6594e457a25 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -655,3 +655,12 @@ void bpf_prog_free(struct bpf_prog *fp) schedule_work(&aux->work); } EXPORT_SYMBOL_GPL(bpf_prog_free); + +/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call + * skb_copy_bits(), so provide a weak definition of it for NET-less config. + */ +int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, + int len) +{ + return -EFAULT; +} diff --git a/net/Kconfig b/net/Kconfig index 6272420a721b..99815b5454bf 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -6,7 +6,7 @@ menuconfig NET bool "Networking support" select NLATTR select GENERIC_NET_UTILS - select ANON_INODES + select BPF ---help--- Unless you really know what you are doing, you should say Y here. The reason is that some programs need kernel networking support even -- cgit v1.2.3 From eeb61e53ea19be0c4015b00b2e8b3b2185436f2b Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 27 Oct 2014 14:18:25 +0400 Subject: sched: Fix race between task_group and sched_task_group The race may happen when somebody is changing task_group of a forking task. Child's cgroup is the same as parent's after dup_task_struct() (there just memory copying). Also, cfs_rq and rt_rq are the same as parent's. But if parent changes its task_group before it's called cgroup_post_fork(), we do not reflect this situation on child. Child's cfs_rq and rt_rq remain the same, while child's task_group changes in cgroup_post_fork(). To fix this we introduce fork() method, which calls sched_move_task() directly. 
This function changes sched_task_group on appropriate (also its logic has no problem with freshly created tasks, so we shouldn't introduce something special; we are able just to use it). Possibly, this decides the Burke Libbey's problem: https://lkml.org/lkml/2014/10/24/456 Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1414405105.19914.169.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 44999505e1bf..dde8adb7d0c0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7833,6 +7833,11 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) sched_offline_group(tg); } +static void cpu_cgroup_fork(struct task_struct *task) +{ + sched_move_task(task); +} + static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { @@ -8205,6 +8210,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { .css_free = cpu_cgroup_css_free, .css_online = cpu_cgroup_css_online, .css_offline = cpu_cgroup_css_offline, + .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, -- cgit v1.2.3 From 64be6f1f5f710f5995d41caf8a1767fe6d2b5a87 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 24 Oct 2014 10:16:37 +0100 Subject: sched/deadline: Don't replenish from a !SCHED_DEADLINE entity In the deboost path, right after the dl_boosted flag has been reset, we can currently end up replenishing using -deadline parameters of a !SCHED_DEADLINE entity. This of course causes a bug, as those parameters are empty. In the case depicted above it is safe to simply bail out, as the deboosted task is going to be back to its original scheduling class anyway. Reported-by: Daniel Wagner Tested-by: Daniel Wagner Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: vincent@legout.info Cc: Dario Faggioli Cc: Michael Trimarchi Cc: Fabio Checconi Link: http://lkml.kernel.org/r/1414142198-18552-4-git-send-email-juri.lelli@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 256e577faf1b..92279eaf0ef2 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -847,8 +847,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * smaller than our one... OTW we keep our runtime and * deadline. */ - if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) + if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) { pi_se = &pi_task->dl; + } else if (!dl_prio(p->normal_prio)) { + /* + * Special case in which we have a !SCHED_DEADLINE task + * that is going to be deboosted, but exceedes its + * runtime while doing so. No point in replenishing + * it, as it's going to return back to its original + * scheduling class after this. + */ + BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); + return; + } /* * If p is throttled, we do nothing. In fact, if it exhausted -- cgit v1.2.3 From aee38ea95419c818dfdde52b115aeffe9cbb259b Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 24 Oct 2014 10:16:38 +0100 Subject: sched/deadline: Fix races between rt_mutex_setprio() and dl_task_timer() dl_task_timer() is racy against several paths. 
Daniel noticed that the replenishment timer may experience a race condition against an enqueue_dl_entity() called from rt_mutex_setprio(). With his own words: rt_mutex_setprio() resets p->dl.dl_throttled. So the pattern is: start_dl_timer() throttled = 1, rt_mutex_setprio() throlled = 0, sched_switch() -> enqueue_task(), dl_task_timer-> enqueue_task() throttled is 0 => BUG_ON(on_dl_rq(dl_se)) fires as the scheduling entity is already enqueued on the -deadline runqueue. As we do for the other races, we just bail out in the replenishment timer code. Reported-by: Daniel Wagner Tested-by: Daniel Wagner Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: vincent@legout.info Cc: Dario Faggioli Cc: Michael Trimarchi Cc: Fabio Checconi Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1414142198-18552-5-git-send-email-juri.lelli@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 92279eaf0ef2..46167899d852 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -518,12 +518,20 @@ again: } /* - * We need to take care of a possible races here. In fact, the - * task might have changed its scheduling policy to something - * different from SCHED_DEADLINE or changed its reservation - * parameters (through sched_setattr()). + * We need to take care of several possible races here: + * + * - the task might have changed its scheduling policy + * to something different than SCHED_DEADLINE + * - the task might have changed its reservation parameters + * (through sched_setattr()) + * - the task might have been boosted by someone else and + * might be in the boosting/deboosting path + * + * In all this cases we bail out, as the task is already + * in the runqueue or is going to be enqueued back anyway. */ - if (!dl_task(p) || dl_se->dl_new) + if (!dl_task(p) || dl_se->dl_new || + dl_se->dl_boosted || !dl_se->dl_throttled) goto unlock; sched_clock_tick(); -- cgit v1.2.3 From 1effd9f19324efb05fccc7421530e11a52db0278 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 22 Oct 2014 11:17:11 +0400 Subject: sched/numa: Fix unsafe get_task_struct() in task_numa_assign() Unlocked access to dst_rq->curr in task_numa_compare() is racy. If curr task is exiting this may be a reason of use-after-free: task_numa_compare() do_exit() ... current->flags |= PF_EXITING; ... release_task() ... ~~delayed_put_task_struct()~~ ... schedule() rcu_read_lock() ... cur = ACCESS_ONCE(dst_rq->curr) ... ... rq->curr = next; ... context_switch() ... finish_task_switch() ... put_task_struct() ... __put_task_struct() ... free_task_struct() task_numa_assign() ... get_task_struct() ... As noted by Oleg: <task_numa_assign() path does get_task_struct(dst_rq->curr) and this is not safe. The task_struct itself can't go away, but rcu_read_lock() can't save us from the final put_task_struct() in finish_task_switch(); this reference goes away without rcu gp>> The patch provides simple check of PF_EXITING flag. If it's not set, this guarantees that call_rcu() of delayed_put_task_struct() callback hasn't happened yet, so we can safely do get_task_struct() in task_numa_assign(). Locked dst_rq->lock protects from concurrency with the last schedule(). Reusing or unmapping of cur's memory may happen without it. 
Suggested-by: Oleg Nesterov Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1413962231.19914.130.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0b069bf3e708..fbc0b8214af0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1164,9 +1164,19 @@ static void task_numa_compare(struct task_numa_env *env, long moveimp = imp; rcu_read_lock(); - cur = ACCESS_ONCE(dst_rq->curr); - if (cur->pid == 0) /* idle */ + + raw_spin_lock_irq(&dst_rq->lock); + cur = dst_rq->curr; + /* + * No need to move the exiting task, and this ensures that ->curr + * wasn't reaped and thus get_task_struct() in task_numa_assign() + * is safe under RCU read lock. + * Note that rcu_read_lock() itself can't protect from the final + * put_task_struct() after the last schedule(). + */ + if ((cur->flags & PF_EXITING) || is_idle_task(cur)) cur = NULL; + raw_spin_unlock_irq(&dst_rq->lock); /* * "imp" is the fault differential for the source task between the -- cgit v1.2.3 From 2847c90e1b3ae95379af24894fc4f98e7f2fd705 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Wed, 22 Oct 2014 16:04:35 +0900 Subject: sched/fair: Care divide error in update_task_scan_period() While offling node by hot removing memory, the following divide error occurs: divide error: 0000 [#1] SMP [...] Call Trace: [...] handle_mm_fault [...] ? try_to_wake_up [...] ? wake_up_state [...] __do_page_fault [...] ? do_futex [...] ? put_prev_entity [...] ? __switch_to [...] do_page_fault [...] page_fault [...] RIP [] task_numa_fault RSP The issue occurs as follows: 1. When page fault occurs and page is allocated from node 1, task_struct->numa_faults_buffer_memory[] of node 1 is incremented and p->numa_faults_locality[] is also incremented as follows: o numa_faults_buffer_memory[] o numa_faults_locality[] NR_NUMA_HINT_FAULT_TYPES | 0 | 1 | ---------------------------------- ---------------------- node 0 | 0 | 0 | remote | 0 | node 1 | 0 | 1 | locale | 1 | ---------------------------------- ---------------------- 2. node 1 is offlined by hot removing memory. 3. When page fault occurs, fault_types[] is calculated by using p->numa_faults_buffer_memory[] of all online nodes in task_numa_placement(). But node 1 was offline by step 2. So the fault_types[] is calculated by using only p->numa_faults_buffer_memory[] of node 0. So both of fault_types[] are set to 0. 4. The values(0) of fault_types[] pass to update_task_scan_period(). 5. numa_faults_locality[1] is set to 1. So the following division is calculated. static void update_task_scan_period(struct task_struct *p, unsigned long shared, unsigned long private){ ... ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); } 6. But both of private and shared are set to 0. So divide error occurs here. The divide error is rare case because the trigger is node offline. This patch always increments denominator for avoiding divide error. 
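The failing arithmetic in step 6 can be reproduced in a few lines of plain userspace C. This is only an illustrative sketch: scan_ratio() and the NUMA_PERIOD_SLOTS value of 10 are stand-ins for the real update_task_scan_period() logic, and DIV_ROUND_UP is copied from the usual kernel definition.

=====
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define NUMA_PERIOD_SLOTS 10			/* illustrative value */

static unsigned long scan_ratio(unsigned long priv, unsigned long shared)
{
	/*
	 * The old form, DIV_ROUND_UP(priv * NUMA_PERIOD_SLOTS, priv + shared),
	 * divides by zero once both fault counters are zero.
	 */
	return DIV_ROUND_UP(priv * NUMA_PERIOD_SLOTS, priv + shared + 1);
}

int main(void)
{
	printf("%lu\n", scan_ratio(0, 0));	/* 0 instead of SIGFPE */
	printf("%lu\n", scan_ratio(30, 10));	/* 8 of 10 slots are private */
	return 0;
}
=====

For realistic fault counts the extra +1 barely changes the ratio, but it keeps the denominator non-zero when a node disappears between the fault and the placement update.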
Signed-off-by: Yasuaki Ishimatsu Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/54475703.8000505@jp.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fbc0b8214af0..e9abd4e4c5cb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1530,7 +1530,7 @@ static void update_task_scan_period(struct task_struct *p, * scanning faster if shared accesses dominate as it may * simply bounce migrations uselessly */ - ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); + ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1)); diff = (diff * ratio) / NUMA_PERIOD_SLOTS; } -- cgit v1.2.3 From 6419265899d9bd27e5ff9f8b43db3715407fc2ba Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 16 Oct 2014 14:39:37 +0400 Subject: sched/fair: Fix division by zero sysctl_numa_balancing_scan_size File /proc/sys/kernel/numa_balancing_scan_size_mb allows writing of zero. This bash command reproduces problem: $ while :; do echo 0 > /proc/sys/kernel/numa_balancing_scan_size_mb; \ echo 256 > /proc/sys/kernel/numa_balancing_scan_size_mb; done divide error: 0000 [#1] SMP Modules linked in: CPU: 0 PID: 24112 Comm: bash Not tainted 3.17.0+ #8 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 task: ffff88013c852600 ti: ffff880037a68000 task.ti: ffff880037a68000 RIP: 0010:[] [] task_scan_min+0x21/0x50 RSP: 0000:ffff880037a6bce0 EFLAGS: 00010246 RAX: 0000000000000a00 RBX: 00000000000003e8 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88013c852600 RBP: ffff880037a6bcf0 R08: 0000000000000001 R09: 0000000000015c90 R10: ffff880239bf6c00 R11: 0000000000000016 R12: 0000000000003fff R13: ffff88013c852600 R14: ffffea0008d1b000 R15: 0000000000000003 FS: 00007f12bb048700(0000) GS:ffff88007da00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000001505678 CR3: 0000000234770000 CR4: 00000000000006f0 Stack: ffff88013c852600 0000000000003fff ffff880037a6bd18 ffffffff810741d1 ffff88013c852600 0000000000003fff 000000000002bfff ffff880037a6bda8 ffffffff81077ef7 ffffea0008a56d40 0000000000000001 0000000000000001 Call Trace: [] task_scan_max+0x11/0x40 [] task_numa_fault+0x1f7/0xae0 [] ? migrate_misplaced_page+0x276/0x300 [] handle_mm_fault+0x62d/0xba0 [] __do_page_fault+0x191/0x510 [] ? native_smp_send_reschedule+0x42/0x60 [] ? check_preempt_curr+0x80/0xa0 [] ? wake_up_new_task+0x11c/0x1a0 [] ? do_fork+0x14d/0x340 [] ? get_unused_fd_flags+0x2b/0x30 [] ? __fd_install+0x1f/0x60 [] do_page_fault+0xc/0x10 [] page_fault+0x22/0x30 RIP [] task_scan_min+0x21/0x50 RSP ---[ end trace 9a826d16936c04de ]--- Also fix race in task_scan_min (it depends on compiler behaviour). Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Aaron Tomlin Cc: Andrew Morton Cc: Dario Faggioli Cc: David Rientjes Cc: Jens Axboe Cc: Kees Cook Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Rik van Riel Link: http://lkml.kernel.org/r/1413455977.24793.78.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 +++-- kernel/sysctl.c | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e9abd4e4c5cb..34baa60f8a7b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -828,11 +828,12 @@ static unsigned int task_nr_scan_windows(struct task_struct *p) static unsigned int task_scan_min(struct task_struct *p) { + unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size); unsigned int scan, floor; unsigned int windows = 1; - if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) - windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; + if (scan_size < MAX_SCAN_WINDOW) + windows = MAX_SCAN_WINDOW / scan_size; floor = 1000 / windows; scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4aada6d9fe74..15f2511a1b7c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -387,7 +387,8 @@ static struct ctl_table kern_table[] = { .data = &sysctl_numa_balancing_scan_size, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, }, { .procname = "numa_balancing", -- cgit v1.2.3 From 009f60e2763568cdcd75bd1cf360c7c7165e2e60 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 5 Oct 2014 22:23:22 +0200 Subject: sched: stop the unbound recursion in preempt_schedule_context() preempt_schedule_context() does preempt_enable_notrace() at the end and this can call the same function again; exception_exit() is heavy and it is quite possible that need-resched is true again. 1. Change this code to dec preempt_count() and check need_resched() by hand. 2. As Linus suggested, we can use the PREEMPT_ACTIVE bit and avoid the enable/disable dance around __schedule(). But in this case we need to move into sched/core.c. 3. Cosmetic, but x86 forgets to declare this function. This doesn't really matter because it is only called by asm helpers, still it make sense to add the declaration into asm/preempt.h to match preempt_schedule(). 
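The recursion-versus-loop point in item 1 can be modelled outside the kernel. The sketch below is a toy and assumes nothing about the real preempt or context-tracking APIs; pending, need_resched() and schedule_once() are made-up stand-ins that only show why re-entering through the tail of the function grows the stack while the do/while form does not.

=====
#include <stdbool.h>
#include <stdio.h>

static int pending = 3;			/* pretend resched is requested 3 times */
static bool need_resched(void) { return pending > 0; }
static void schedule_once(void) { if (pending > 0) pending--; }

/* old shape: the re-enable path can call back in, one stack frame per wakeup */
static void resched_recursive(int depth)
{
	schedule_once();
	printf("recursive pass, depth %d\n", depth);
	if (need_resched())
		resched_recursive(depth + 1);
}

/* new shape: same behaviour, constant stack depth */
static void resched_loop(void)
{
	do {
		schedule_once();
	} while (need_resched());
}

int main(void)
{
	resched_recursive(0);
	pending = 3;
	resched_loop();
	return 0;
}
=====

In the kernel the loop body additionally brackets __schedule() with PREEMPT_ACTIVE and exception_enter()/exception_exit(), as the sched/core.c hunk below shows.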
Reported-by: Sasha Levin Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Graf Cc: Andrew Morton Cc: Christoph Lameter Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Steven Rostedt Cc: Peter Anvin Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Chuck Ebbert Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20141005202322.GB27962@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/preempt.h | 1 + kernel/context_tracking.c | 40 ---------------------------------------- kernel/sched/core.c | 41 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 7024c12f7bfe..400873450e33 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -105,6 +105,7 @@ static __always_inline bool should_resched(void) # ifdef CONFIG_CONTEXT_TRACKING extern asmlinkage void ___preempt_schedule_context(void); # define __preempt_schedule_context() asm ("call ___preempt_schedule_context") + extern asmlinkage void preempt_schedule_context(void); # endif #endif diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 5664985c46a0..937ecdfdf258 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -107,46 +107,6 @@ void context_tracking_user_enter(void) } NOKPROBE_SYMBOL(context_tracking_user_enter); -#ifdef CONFIG_PREEMPT -/** - * preempt_schedule_context - preempt_schedule called by tracing - * - * The tracing infrastructure uses preempt_enable_notrace to prevent - * recursion and tracing preempt enabling caused by the tracing - * infrastructure itself. But as tracing can happen in areas coming - * from userspace or just about to enter userspace, a preempt enable - * can occur before user_exit() is called. This will cause the scheduler - * to be called when the system is still in usermode. - * - * To prevent this, the preempt_enable_notrace will use this function - * instead of preempt_schedule() to exit user context if needed before - * calling the scheduler. - */ -asmlinkage __visible void __sched notrace preempt_schedule_context(void) -{ - enum ctx_state prev_ctx; - - if (likely(!preemptible())) - return; - - /* - * Need to disable preemption in case user_exit() is traced - * and the tracer calls preempt_enable_notrace() causing - * an infinite recursion. - */ - preempt_disable_notrace(); - prev_ctx = exception_enter(); - preempt_enable_no_resched_notrace(); - - preempt_schedule(); - - preempt_disable_notrace(); - exception_exit(prev_ctx); - preempt_enable_notrace(); -} -EXPORT_SYMBOL_GPL(preempt_schedule_context); -#endif /* CONFIG_PREEMPT */ - /** * context_tracking_user_exit - Inform the context tracking that the CPU is * exiting userspace mode and entering the kernel. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dde8adb7d0c0..240157c13ddc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2951,6 +2951,47 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) } NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); + +#ifdef CONFIG_CONTEXT_TRACKING +/** + * preempt_schedule_context - preempt_schedule called by tracing + * + * The tracing infrastructure uses preempt_enable_notrace to prevent + * recursion and tracing preempt enabling caused by the tracing + * infrastructure itself. 
But as tracing can happen in areas coming + * from userspace or just about to enter userspace, a preempt enable + * can occur before user_exit() is called. This will cause the scheduler + * to be called when the system is still in usermode. + * + * To prevent this, the preempt_enable_notrace will use this function + * instead of preempt_schedule() to exit user context if needed before + * calling the scheduler. + */ +asmlinkage __visible void __sched notrace preempt_schedule_context(void) +{ + enum ctx_state prev_ctx; + + if (likely(!preemptible())) + return; + + do { + __preempt_count_add(PREEMPT_ACTIVE); + /* + * Needs preempt disabled in case user_exit() is traced + * and the tracer calls preempt_enable_notrace() causing + * an infinite recursion. + */ + prev_ctx = exception_enter(); + __schedule(); + exception_exit(prev_ctx); + + __preempt_count_sub(PREEMPT_ACTIVE); + barrier(); + } while (need_resched()); +} +EXPORT_SYMBOL_GPL(preempt_schedule_context); +#endif /* CONFIG_CONTEXT_TRACKING */ + #endif /* CONFIG_PREEMPT */ /* -- cgit v1.2.3 From f3a7e1a9c464a32ee186ab91388313c82e7ce018 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 21 Oct 2014 20:35:56 +0400 Subject: sched/dl: Fix preemption checks 1) switched_to_dl() check is wrong. We reschedule only if rq->curr is deadline task, and we do not reschedule if it's a lower priority task. But we must always preempt a task of other classes. 2) dl_task_timer(): Policy does not change in case of priority inheritance. rt_mutex_setprio() changes prio, while policy remains old. So we lose some balancing logic in dl_task_timer() and switched_to_dl() when we check policy instead of priority. Boosted task may be rq->curr. (I didn't change switched_from_dl() because no check is necessary there at all). I've looked at this place(switched_to_dl) several times and even fixed this function, but found just now... I suppose some performance tests may work better after this. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1413909356.19914.128.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 46167899d852..5285332392d5 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -540,7 +540,7 @@ again: dl_se->dl_yielded = 0; if (task_on_rq_queued(p)) { enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); - if (task_has_dl_policy(rq->curr)) + if (dl_task(rq->curr)) check_preempt_curr_dl(rq, p, 0); else resched_curr(rq); @@ -1626,8 +1626,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) /* Only reschedule if pushing failed */ check_resched = 0; #endif /* CONFIG_SMP */ - if (check_resched && task_has_dl_policy(rq->curr)) - check_preempt_curr_dl(rq, p, 0); + if (check_resched) { + if (dl_task(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + else + resched_curr(rq); + } } } -- cgit v1.2.3 From c719f56092add9b3d4192f57c64ce7af11105130 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Oct 2014 11:10:21 +0200 Subject: perf: Fix and clean up initialization of pmu::event_idx Andy reported that the current state of event_idx is rather confused. So remove all but the x86_pmu implementation and change the default to return 0 (the safe option). 
Reported-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Benjamin Herrenschmidt Cc: Christoph Lameter Cc: Cody P Schafer Cc: Cody P Schafer Cc: Heiko Carstens Cc: Hendrik Brueckner Cc: Himangi Saraogi Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Gortmaker Cc: Paul Mackerras Cc: sukadev@linux.vnet.ibm.com Cc: Thomas Huth Cc: Vince Weaver Cc: linux390@de.ibm.com Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-s390@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/powerpc/perf/hv-24x7.c | 6 ------ arch/powerpc/perf/hv-gpci.c | 6 ------ arch/s390/kernel/perf_cpum_sf.c | 6 ------ kernel/events/core.c | 15 +-------------- kernel/events/hw_breakpoint.c | 7 ------- 5 files changed, 1 insertion(+), 39 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c index 6c8710dd90c9..dba34088da28 100644 --- a/arch/powerpc/perf/hv-24x7.c +++ b/arch/powerpc/perf/hv-24x7.c @@ -417,11 +417,6 @@ static int h_24x7_event_add(struct perf_event *event, int flags) return 0; } -static int h_24x7_event_idx(struct perf_event *event) -{ - return 0; -} - static struct pmu h_24x7_pmu = { .task_ctx_nr = perf_invalid_context, @@ -433,7 +428,6 @@ static struct pmu h_24x7_pmu = { .start = h_24x7_event_start, .stop = h_24x7_event_stop, .read = h_24x7_event_update, - .event_idx = h_24x7_event_idx, }; static int hv_24x7_init(void) diff --git a/arch/powerpc/perf/hv-gpci.c b/arch/powerpc/perf/hv-gpci.c index 15fc76c93022..a051fe946c63 100644 --- a/arch/powerpc/perf/hv-gpci.c +++ b/arch/powerpc/perf/hv-gpci.c @@ -246,11 +246,6 @@ static int h_gpci_event_init(struct perf_event *event) return 0; } -static int h_gpci_event_idx(struct perf_event *event) -{ - return 0; -} - static struct pmu h_gpci_pmu = { .task_ctx_nr = perf_invalid_context, @@ -262,7 +257,6 @@ static struct pmu h_gpci_pmu = { .start = h_gpci_event_start, .stop = h_gpci_event_stop, .read = h_gpci_event_update, - .event_idx = h_gpci_event_idx, }; static int hv_gpci_init(void) diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 08e761318c17..b878f12a9597 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -1411,11 +1411,6 @@ static void cpumsf_pmu_del(struct perf_event *event, int flags) perf_pmu_enable(event->pmu); } -static int cpumsf_pmu_event_idx(struct perf_event *event) -{ - return event->hw.idx; -} - CPUMF_EVENT_ATTR(SF, SF_CYCLES_BASIC, PERF_EVENT_CPUM_SF); CPUMF_EVENT_ATTR(SF, SF_CYCLES_BASIC_DIAG, PERF_EVENT_CPUM_SF_DIAG); @@ -1458,7 +1453,6 @@ static struct pmu cpumf_sampling = { .stop = cpumsf_pmu_stop, .read = cpumsf_pmu_read, - .event_idx = cpumsf_pmu_event_idx, .attr_groups = cpumsf_pmu_attr_groups, }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 1425d07018de..2b02c9fda790 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6071,11 +6071,6 @@ static int perf_swevent_init(struct perf_event *event) return 0; } -static int perf_swevent_event_idx(struct perf_event *event) -{ - return 0; -} - static struct pmu perf_swevent = { .task_ctx_nr = perf_sw_context, @@ -6085,8 +6080,6 @@ static struct pmu perf_swevent = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, - - .event_idx = perf_swevent_event_idx, }; #ifdef CONFIG_EVENT_TRACING @@ -6204,8 +6197,6 @@ static struct pmu perf_tracepoint = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, - - .event_idx = perf_swevent_event_idx, }; 
static inline void perf_tp_register(void) @@ -6431,8 +6422,6 @@ static struct pmu perf_cpu_clock = { .start = cpu_clock_event_start, .stop = cpu_clock_event_stop, .read = cpu_clock_event_read, - - .event_idx = perf_swevent_event_idx, }; /* @@ -6511,8 +6500,6 @@ static struct pmu perf_task_clock = { .start = task_clock_event_start, .stop = task_clock_event_stop, .read = task_clock_event_read, - - .event_idx = perf_swevent_event_idx, }; static void perf_pmu_nop_void(struct pmu *pmu) @@ -6542,7 +6529,7 @@ static void perf_pmu_cancel_txn(struct pmu *pmu) static int perf_event_idx_default(struct perf_event *event) { - return event->hw.idx + 1; + return 0; } /* diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 1559fb0b9296..9803a6600d49 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -605,11 +605,6 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags) bp->hw.state = PERF_HES_STOPPED; } -static int hw_breakpoint_event_idx(struct perf_event *bp) -{ - return 0; -} - static struct pmu perf_breakpoint = { .task_ctx_nr = perf_sw_context, /* could eventually get its own */ @@ -619,8 +614,6 @@ static struct pmu perf_breakpoint = { .start = hw_breakpoint_start, .stop = hw_breakpoint_stop, .read = hw_breakpoint_pmu_read, - - .event_idx = hw_breakpoint_event_idx, }; int __init init_hw_breakpoint(void) -- cgit v1.2.3 From d7e29933969e5ca7c112ce1368a07911f4485dc2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Oct 2014 09:15:54 -0700 Subject: rcu: Make rcu_barrier() understand about missing rcuo kthreads Commit 35ce7f29a44a (rcu: Create rcuo kthreads only for onlined CPUs) avoids creating rcuo kthreads for CPUs that never come online. This fixes a bug in many instances of firmware: Instead of lying about their age, these systems instead lie about the number of CPUs that they have. Before commit 35ce7f29a44a, this could result in huge numbers of useless rcuo kthreads being created. It appears that experience indicates that I should have told the people suffering from this problem to fix their broken firmware, but I instead produced what turned out to be a partial fix. The missing piece supplied by this commit makes sure that rcu_barrier() knows not to post callbacks for no-CBs CPUs that have not yet come online, because otherwise rcu_barrier() will hang on systems having firmware that lies about the number of CPUs. It is tempting to simply have rcu_barrier() refuse to post a callback on any no-CBs CPU that does not have an rcuo kthread. This unfortunately does not work because rcu_barrier() is required to wait for all pending callbacks. It is therefore required to wait even for those callbacks that cannot possibly be invoked. Even if doing so hangs the system. Given that posting a callback to a no-CBs CPU that does not yet have an rcuo kthread can hang rcu_barrier(), It is tempting to report an error in this case. Unfortunately, this will result in false positives at boot time, when it is perfectly legal to post callbacks to the boot CPU before the scheduler has started, in other words, before it is legal to invoke rcu_barrier(). So this commit instead has rcu_barrier() avoid posting callbacks to CPUs having neither rcuo kthread nor pending callbacks, and has it complain bitterly if it finds CPUs having no rcuo kthread but some pending callbacks. And when rcu_barrier() does find CPUs having no rcuo kthread but pending callbacks, as noted earlier, it has no choice but to hang indefinitely. 
Reported-by: Yanko Kaneti Reported-by: Jay Vosburgh Reported-by: Meelis Roos Reported-by: Eric B Munson Signed-off-by: Paul E. McKenney Tested-by: Eric B Munson Tested-by: Jay Vosburgh Tested-by: Yanko Kaneti Tested-by: Kevin Fenzi Tested-by: Meelis Roos --- include/trace/events/rcu.h | 18 +++++++++--------- kernel/rcu/tree.c | 15 ++++++++++----- kernel/rcu/tree.h | 1 + kernel/rcu/tree_plugin.h | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 53 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 9b56f37148cf..e335e7d8c6c2 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -660,18 +660,18 @@ TRACE_EVENT(rcu_torture_read, /* * Tracepoint for _rcu_barrier() execution. The string "s" describes * the _rcu_barrier phase: - * "Begin": rcu_barrier_callback() started. - * "Check": rcu_barrier_callback() checking for piggybacking. - * "EarlyExit": rcu_barrier_callback() piggybacked, thus early exit. - * "Inc1": rcu_barrier_callback() piggyback check counter incremented. - * "Offline": rcu_barrier_callback() found offline CPU - * "OnlineNoCB": rcu_barrier_callback() found online no-CBs CPU. - * "OnlineQ": rcu_barrier_callback() found online CPU with callbacks. - * "OnlineNQ": rcu_barrier_callback() found online CPU, no callbacks. + * "Begin": _rcu_barrier() started. + * "Check": _rcu_barrier() checking for piggybacking. + * "EarlyExit": _rcu_barrier() piggybacked, thus early exit. + * "Inc1": _rcu_barrier() piggyback check counter incremented. + * "OfflineNoCB": _rcu_barrier() found callback on never-online CPU + * "OnlineNoCB": _rcu_barrier() found online no-CBs CPU. + * "OnlineQ": _rcu_barrier() found online CPU with callbacks. + * "OnlineNQ": _rcu_barrier() found online CPU, no callbacks. * "IRQ": An rcu_barrier_callback() callback posted on remote CPU. * "CB": An rcu_barrier_callback() invoked a callback, not the last. * "LastCB": An rcu_barrier_callback() invoked the last callback. - * "Inc2": rcu_barrier_callback() piggyback check counter incremented. + * "Inc2": _rcu_barrier() piggyback check counter incremented. * The "cpu" argument is the CPU or -1 if meaningless, the "cnt" argument * is the count of remaining callbacks, and "done" is the piggybacking count. 
*/ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 133e47223095..9815447d22e0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3299,11 +3299,16 @@ static void _rcu_barrier(struct rcu_state *rsp) continue; rdp = per_cpu_ptr(rsp->rda, cpu); if (rcu_is_nocb_cpu(cpu)) { - _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, - rsp->n_barrier_done); - atomic_inc(&rsp->barrier_cpu_count); - __call_rcu(&rdp->barrier_head, rcu_barrier_callback, - rsp, cpu, 0); + if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { + _rcu_barrier_trace(rsp, "OfflineNoCB", cpu, + rsp->n_barrier_done); + } else { + _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, + rsp->n_barrier_done); + atomic_inc(&rsp->barrier_cpu_count); + __call_rcu(&rdp->barrier_head, + rcu_barrier_callback, rsp, cpu, 0); + } } else if (ACCESS_ONCE(rdp->qlen)) { _rcu_barrier_trace(rsp, "OnlineQ", cpu, rsp->n_barrier_done); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index d03764652d91..bbdc45d8d74f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -587,6 +587,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); +static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); static void rcu_init_one_nocb(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 387dd4599344..c1d7f27bd38f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2049,6 +2049,33 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) } } +/* + * Does the specified CPU need an RCU callback for the specified flavor + * of rcu_barrier()? + */ +static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_head *rhp; + + /* No-CBs CPUs might have callbacks on any of three lists. */ + rhp = ACCESS_ONCE(rdp->nocb_head); + if (!rhp) + rhp = ACCESS_ONCE(rdp->nocb_gp_head); + if (!rhp) + rhp = ACCESS_ONCE(rdp->nocb_follower_head); + + /* Having no rcuo kthread but CBs after scheduler starts is bad! */ + if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp) { + /* RCU callback enqueued before CPU first came online??? */ + pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", + cpu, rhp->func); + WARN_ON_ONCE(1); + } + + return !!rhp; +} + /* * Enqueue the specified string of rcu_head structures onto the specified * CPU's no-CBs lists. The CPU is specified by rdp, the head of the @@ -2642,6 +2669,12 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) #else /* #ifdef CONFIG_RCU_NOCB_CPU */ +static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) +{ + WARN_ON_ONCE(1); /* Should be dead code. */ + return false; +} + static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { } -- cgit v1.2.3 From f601de204465048bdf0d5537f630729622ebc3a6 Mon Sep 17 00:00:00 2001 From: Riku Voipio Date: Wed, 29 Oct 2014 14:50:24 -0700 Subject: gcov: add ARM64 to GCOV_PROFILE_ALL Following up the arm testing of gcov, turns out gcov on ARM64 works fine as well. Only change needed is adding ARM64 to Kconfig depends. 
Tested with qemu and mach-virt Signed-off-by: Riku Voipio Acked-by: Peter Oberparleiter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/gcov/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index cf66c5c8458e..3b7408759bdf 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -35,7 +35,7 @@ config GCOV_KERNEL config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM + depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM || ARM64 default n ---help--- This options activates profiling for the entire kernel. -- cgit v1.2.3 From 0baf2a4dbf75abb7c186fd6c8d55d27aaa354a29 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 29 Oct 2014 14:50:35 -0700 Subject: kernel/kmod: fix use-after-free of the sub_info structure Found this in the message log on a s390 system: BUG kmalloc-192 (Not tainted): Poison overwritten Disabling lock debugging due to kernel taint INFO: 0x00000000684761f4-0x00000000684761f7. First byte 0xff instead of 0x6b INFO: Allocated in call_usermodehelper_setup+0x70/0x128 age=71 cpu=2 pid=648 __slab_alloc.isra.47.constprop.56+0x5f6/0x658 kmem_cache_alloc_trace+0x106/0x408 call_usermodehelper_setup+0x70/0x128 call_usermodehelper+0x62/0x90 cgroup_release_agent+0x178/0x1c0 process_one_work+0x36e/0x680 worker_thread+0x2f0/0x4f8 kthread+0x10a/0x120 kernel_thread_starter+0x6/0xc kernel_thread_starter+0x0/0xc INFO: Freed in call_usermodehelper_exec+0x110/0x1b8 age=71 cpu=2 pid=648 __slab_free+0x94/0x560 kfree+0x364/0x3e0 call_usermodehelper_exec+0x110/0x1b8 cgroup_release_agent+0x178/0x1c0 process_one_work+0x36e/0x680 worker_thread+0x2f0/0x4f8 kthread+0x10a/0x120 kernel_thread_starter+0x6/0xc kernel_thread_starter+0x0/0xc There is a use-after-free bug on the subprocess_info structure allocated by the user mode helper. In case do_execve() returns with an error ____call_usermodehelper() stores the error code to sub_info->retval, but sub_info can already have been freed. Regarding UMH_NO_WAIT, the sub_info structure can be freed by __call_usermodehelper() before the worker thread returns from do_execve(), allowing memory corruption when do_execve() failed after exec_mmap() is called. Regarding UMH_WAIT_EXEC, the call to umh_complete() allows call_usermodehelper_exec() to continue which then frees sub_info. To fix this race the code needs to make sure that the call to call_usermodehelper_freeinfo() is always done after the last store to sub_info->retval. Signed-off-by: Martin Schwidefsky Reviewed-by: Oleg Nesterov Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 76 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 37 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 8637e041a247..80f7a6d00519 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -196,12 +196,34 @@ int __request_module(bool wait, const char *fmt, ...) EXPORT_SYMBOL(__request_module); #endif /* CONFIG_MODULES */ +static void call_usermodehelper_freeinfo(struct subprocess_info *info) +{ + if (info->cleanup) + (*info->cleanup)(info); + kfree(info); +} + +static void umh_complete(struct subprocess_info *sub_info) +{ + struct completion *comp = xchg(&sub_info->complete, NULL); + /* + * See call_usermodehelper_exec(). 
If xchg() returns NULL + * we own sub_info, the UMH_KILLABLE caller has gone away + * or the caller used UMH_NO_WAIT. + */ + if (comp) + complete(comp); + else + call_usermodehelper_freeinfo(sub_info); +} + /* * This is the task which runs the usermode application */ static int ____call_usermodehelper(void *data) { struct subprocess_info *sub_info = data; + int wait = sub_info->wait & ~UMH_KILLABLE; struct cred *new; int retval; @@ -221,7 +243,7 @@ static int ____call_usermodehelper(void *data) retval = -ENOMEM; new = prepare_kernel_cred(current); if (!new) - goto fail; + goto out; spin_lock(&umh_sysctl_lock); new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); @@ -233,7 +255,7 @@ static int ____call_usermodehelper(void *data) retval = sub_info->init(sub_info, new); if (retval) { abort_creds(new); - goto fail; + goto out; } } @@ -242,12 +264,13 @@ static int ____call_usermodehelper(void *data) retval = do_execve(getname_kernel(sub_info->path), (const char __user *const __user *)sub_info->argv, (const char __user *const __user *)sub_info->envp); +out: + sub_info->retval = retval; + /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */ + if (wait != UMH_WAIT_PROC) + umh_complete(sub_info); if (!retval) return 0; - - /* Exec failed? */ -fail: - sub_info->retval = retval; do_exit(0); } @@ -258,26 +281,6 @@ static int call_helper(void *data) return ____call_usermodehelper(data); } -static void call_usermodehelper_freeinfo(struct subprocess_info *info) -{ - if (info->cleanup) - (*info->cleanup)(info); - kfree(info); -} - -static void umh_complete(struct subprocess_info *sub_info) -{ - struct completion *comp = xchg(&sub_info->complete, NULL); - /* - * See call_usermodehelper_exec(). If xchg() returns NULL - * we own sub_info, the UMH_KILLABLE caller has gone away. - */ - if (comp) - complete(comp); - else - call_usermodehelper_freeinfo(sub_info); -} - /* Keventd can't block, but this (a child) can. */ static int wait_for_helper(void *data) { @@ -336,18 +339,8 @@ static void __call_usermodehelper(struct work_struct *work) kmod_thread_locker = NULL; } - switch (wait) { - case UMH_NO_WAIT: - call_usermodehelper_freeinfo(sub_info); - break; - - case UMH_WAIT_PROC: - if (pid > 0) - break; - /* FALLTHROUGH */ - case UMH_WAIT_EXEC: - if (pid < 0) - sub_info->retval = pid; + if (pid < 0) { + sub_info->retval = pid; umh_complete(sub_info); } } @@ -588,7 +581,12 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) goto out; } - sub_info->complete = &done; + /* + * Set the completion pointer only if there is a waiter. + * This makes it possible to use umh_complete to free + * the data structure in case of UMH_NO_WAIT. + */ + sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done; sub_info->wait = wait; queue_work(khelper_wq, &sub_info->work); -- cgit v1.2.3 From 086ba77a6db00ed858ff07451bedee197df868c9 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Wed, 29 Oct 2014 23:06:58 +0100 Subject: tracing/syscalls: Ignore numbers outside NR_syscalls' range ARM has some private syscalls (for example, set_tls(2)) which lie outside the range of NR_syscalls. If any of these are called while syscall tracing is being performed, out-of-bounds array access will occur in the ftrace and perf sys_{enter,exit} handlers. # trace-cmd record -e raw_syscalls:* true && trace-cmd report ... 
true-653 [000] 384.675777: sys_enter: NR 192 (0, 1000, 3, 4000022, ffffffff, 0) true-653 [000] 384.675812: sys_exit: NR 192 = 1995915264 true-653 [000] 384.675971: sys_enter: NR 983045 (76f74480, 76f74000, 76f74b28, 76f74480, 76f76f74, 1) true-653 [000] 384.675988: sys_exit: NR 983045 = 0 ... # trace-cmd record -e syscalls:* true [ 17.289329] Unable to handle kernel paging request at virtual address aaaaaace [ 17.289590] pgd = 9e71c000 [ 17.289696] [aaaaaace] *pgd=00000000 [ 17.289985] Internal error: Oops: 5 [#1] PREEMPT SMP ARM [ 17.290169] Modules linked in: [ 17.290391] CPU: 0 PID: 704 Comm: true Not tainted 3.18.0-rc2+ #21 [ 17.290585] task: 9f4dab00 ti: 9e710000 task.ti: 9e710000 [ 17.290747] PC is at ftrace_syscall_enter+0x48/0x1f8 [ 17.290866] LR is at syscall_trace_enter+0x124/0x184 Fix this by ignoring out-of-NR_syscalls-bounds syscall numbers. Commit cd0980fc8add "tracing: Check invalid syscall nr while tracing syscalls" added the check for less than zero, but it should have also checked for greater than NR_syscalls. Link: http://lkml.kernel.org/p/1414620418-29472-1-git-send-email-rabin@rab.in Fixes: cd0980fc8add "tracing: Check invalid syscall nr while tracing syscalls" Cc: stable@vger.kernel.org # 2.6.33+ Signed-off-by: Rabin Vincent Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 4dc8b79c5f75..29228c4d5696 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -313,7 +313,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) int size; syscall_nr = trace_get_syscall_nr(current, regs); - if (syscall_nr < 0) + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ @@ -360,7 +360,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) int syscall_nr; syscall_nr = trace_get_syscall_nr(current, regs); - if (syscall_nr < 0) + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ @@ -567,7 +567,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) int size; syscall_nr = trace_get_syscall_nr(current, regs); - if (syscall_nr < 0) + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) return; @@ -641,7 +641,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) int size; syscall_nr = trace_get_syscall_nr(current, regs); - if (syscall_nr < 0) + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) return; -- cgit v1.2.3
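The class of bug fixed above comes down to indexing a fixed-size table with a number that is only checked against zero. The snippet below is merely an illustration: NR_SYSCALLS and lookup() are made-up stand-ins for NR_syscalls and the syscall metadata arrays, and 983045 is the ARM private syscall number visible in the trace output quoted in the commit message.

=====
#include <stdio.h>

#define NR_SYSCALLS 400				/* illustrative table size */

static const char *names[NR_SYSCALLS];		/* stands in for the metadata array */

static const char *lookup(long nr)
{
	/*
	 * Checking only nr < 0, as the old handlers did, lets an ARM
	 * private syscall such as 983045 index far past the table.
	 */
	if (nr < 0 || nr >= NR_SYSCALLS)
		return NULL;
	return names[nr];
}

int main(void)
{
	printf("%p\n", (void *)lookup(983045));	/* NULL, not an out-of-bounds read */
	return 0;
}
=====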