diff options
Diffstat (limited to 'kernel')
43 files changed, 1739 insertions, 1020 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index d1574d47cf27..271fd3119af9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -176,7 +176,7 @@ signing_key.priv signing_key.x509: x509.genkey openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \ -batch -x509 -config x509.genkey \ -outform DER -out signing_key.x509 \ - -keyout signing_key.priv + -keyout signing_key.priv 2>&1 @echo "###" @echo "### Key pair generated." @echo "###" diff --git a/kernel/acct.c b/kernel/acct.c index 85389fe2abd0..8d6e145138bb 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -540,10 +540,15 @@ static void do_acct_process(struct bsd_acct_struct *acct, ac.ac_swaps = encode_comp_t(0); /* + * Get freeze protection. If the fs is frozen, just skip the write + * as we could deadlock the system otherwise. + */ + if (!file_start_write_trylock(file)) + goto out; + /* * Kernel segment override to datasegment and write it * to the accounting file. */ - file_start_write(file); fs = get_fs(); set_fs(KERNEL_DS); /* diff --git a/kernel/audit.c b/kernel/audit.c index 0b084fa44b1f..21c7fa615bd3 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -49,6 +49,8 @@ #include <linux/slab.h> #include <linux/err.h> #include <linux/kthread.h> +#include <linux/kernel.h> +#include <linux/syscalls.h> #include <linux/audit.h> @@ -265,7 +267,6 @@ void audit_log_lost(const char *message) } static int audit_log_config_change(char *function_name, int new, int old, - kuid_t loginuid, u32 sessionid, u32 sid, int allow_changes) { struct audit_buffer *ab; @@ -274,29 +275,17 @@ static int audit_log_config_change(char *function_name, int new, int old, ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return rc; - audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, - old, from_kuid(&init_user_ns, loginuid), sessionid); - if (sid) { - char *ctx = NULL; - u32 len; - - rc = security_secid_to_secctx(sid, &ctx, &len); - if (rc) { - audit_log_format(ab, " sid=%u", sid); - allow_changes = 0; /* Something weird, deny request */ - } else { - audit_log_format(ab, " subj=%s", ctx); - security_release_secctx(ctx, len); - } - } + audit_log_format(ab, "%s=%d old=%d", function_name, new, old); + audit_log_session_info(ab); + rc = audit_log_task_context(ab); + if (rc) + allow_changes = 0; /* Something weird, deny request */ audit_log_format(ab, " res=%d", allow_changes); audit_log_end(ab); return rc; } -static int audit_do_config_change(char *function_name, int *to_change, - int new, kuid_t loginuid, u32 sessionid, - u32 sid) +static int audit_do_config_change(char *function_name, int *to_change, int new) { int allow_changes, rc = 0, old = *to_change; @@ -307,8 +296,7 @@ static int audit_do_config_change(char *function_name, int *to_change, allow_changes = 1; if (audit_enabled != AUDIT_OFF) { - rc = audit_log_config_change(function_name, new, old, loginuid, - sessionid, sid, allow_changes); + rc = audit_log_config_change(function_name, new, old, allow_changes); if (rc) allow_changes = 0; } @@ -322,44 +310,37 @@ static int audit_do_config_change(char *function_name, int *to_change, return rc; } -static int audit_set_rate_limit(int limit, kuid_t loginuid, u32 sessionid, - u32 sid) +static int audit_set_rate_limit(int limit) { - return audit_do_config_change("audit_rate_limit", &audit_rate_limit, - limit, loginuid, sessionid, sid); + return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit); } -static int audit_set_backlog_limit(int limit, kuid_t loginuid, u32 sessionid, - u32 sid) +static int audit_set_backlog_limit(int limit) { - return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, - limit, loginuid, sessionid, sid); + return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit); } -static int audit_set_enabled(int state, kuid_t loginuid, u32 sessionid, u32 sid) +static int audit_set_enabled(int state) { int rc; if (state < AUDIT_OFF || state > AUDIT_LOCKED) return -EINVAL; - rc = audit_do_config_change("audit_enabled", &audit_enabled, state, - loginuid, sessionid, sid); - + rc = audit_do_config_change("audit_enabled", &audit_enabled, state); if (!rc) audit_ever_enabled |= !!state; return rc; } -static int audit_set_failure(int state, kuid_t loginuid, u32 sessionid, u32 sid) +static int audit_set_failure(int state) { if (state != AUDIT_FAIL_SILENT && state != AUDIT_FAIL_PRINTK && state != AUDIT_FAIL_PANIC) return -EINVAL; - return audit_do_config_change("audit_failure", &audit_failure, state, - loginuid, sessionid, sid); + return audit_do_config_change("audit_failure", &audit_failure, state); } /* @@ -417,34 +398,53 @@ static void kauditd_send_skb(struct sk_buff *skb) consume_skb(skb); } -static int kauditd_thread(void *dummy) +/* + * flush_hold_queue - empty the hold queue if auditd appears + * + * If auditd just started, drain the queue of messages already + * sent to syslog/printk. Remember loss here is ok. We already + * called audit_log_lost() if it didn't go out normally. so the + * race between the skb_dequeue and the next check for audit_pid + * doesn't matter. + * + * If you ever find kauditd to be too slow we can get a perf win + * by doing our own locking and keeping better track if there + * are messages in this queue. I don't see the need now, but + * in 5 years when I want to play with this again I'll see this + * note and still have no friggin idea what i'm thinking today. + */ +static void flush_hold_queue(void) { struct sk_buff *skb; + if (!audit_default || !audit_pid) + return; + + skb = skb_dequeue(&audit_skb_hold_queue); + if (likely(!skb)) + return; + + while (skb && audit_pid) { + kauditd_send_skb(skb); + skb = skb_dequeue(&audit_skb_hold_queue); + } + + /* + * if auditd just disappeared but we + * dequeued an skb we need to drop ref + */ + if (skb) + consume_skb(skb); +} + +static int kauditd_thread(void *dummy) +{ set_freezable(); while (!kthread_should_stop()) { - /* - * if auditd just started drain the queue of messages already - * sent to syslog/printk. remember loss here is ok. we already - * called audit_log_lost() if it didn't go out normally. so the - * race between the skb_dequeue and the next check for audit_pid - * doesn't matter. - * - * if you ever find kauditd to be too slow we can get a perf win - * by doing our own locking and keeping better track if there - * are messages in this queue. I don't see the need now, but - * in 5 years when I want to play with this again I'll see this - * note and still have no friggin idea what i'm thinking today. - */ - if (audit_default && audit_pid) { - skb = skb_dequeue(&audit_skb_hold_queue); - if (unlikely(skb)) { - while (skb && audit_pid) { - kauditd_send_skb(skb); - skb = skb_dequeue(&audit_skb_hold_queue); - } - } - } + struct sk_buff *skb; + DECLARE_WAITQUEUE(wait, current); + + flush_hold_queue(); skb = skb_dequeue(&audit_skb_queue); wake_up(&audit_backlog_wait); @@ -453,19 +453,18 @@ static int kauditd_thread(void *dummy) kauditd_send_skb(skb); else audit_printk_skb(skb); - } else { - DECLARE_WAITQUEUE(wait, current); - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kauditd_wait, &wait); - - if (!skb_queue_len(&audit_skb_queue)) { - try_to_freeze(); - schedule(); - } + continue; + } + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&kauditd_wait, &wait); - __set_current_state(TASK_RUNNING); - remove_wait_queue(&kauditd_wait, &wait); + if (!skb_queue_len(&audit_skb_queue)) { + try_to_freeze(); + schedule(); } + + __set_current_state(TASK_RUNNING); + remove_wait_queue(&kauditd_wait, &wait); } return 0; } @@ -579,13 +578,14 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) return -EPERM; switch (msg_type) { - case AUDIT_GET: case AUDIT_LIST: - case AUDIT_LIST_RULES: - case AUDIT_SET: case AUDIT_ADD: - case AUDIT_ADD_RULE: case AUDIT_DEL: + return -EOPNOTSUPP; + case AUDIT_GET: + case AUDIT_SET: + case AUDIT_LIST_RULES: + case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: case AUDIT_SIGNAL_INFO: case AUDIT_TTY_GET: @@ -608,12 +608,10 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) return err; } -static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, - kuid_t auid, u32 ses, u32 sid) +static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) { int rc = 0; - char *ctx = NULL; - u32 len; + uid_t uid = from_kuid(&init_user_ns, current_uid()); if (!audit_enabled) { *ab = NULL; @@ -623,33 +621,21 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); if (unlikely(!*ab)) return rc; - audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", - task_tgid_vnr(current), - from_kuid(&init_user_ns, current_uid()), - from_kuid(&init_user_ns, auid), ses); - if (sid) { - rc = security_secid_to_secctx(sid, &ctx, &len); - if (rc) - audit_log_format(*ab, " ssid=%u", sid); - else { - audit_log_format(*ab, " subj=%s", ctx); - security_release_secctx(ctx, len); - } - } + audit_log_format(*ab, "pid=%d uid=%u", task_tgid_vnr(current), uid); + audit_log_session_info(*ab); + audit_log_task_context(*ab); return rc; } static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { - u32 seq, sid; + u32 seq; void *data; struct audit_status *status_get, status_set; int err; struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; - kuid_t loginuid; /* loginuid of sender */ - u32 sessionid; struct audit_sig_info *sig_data; char *ctx = NULL; u32 len; @@ -668,9 +654,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err; } } - loginuid = audit_get_loginuid(current); - sessionid = audit_get_sessionid(current); - security_task_getsecid(current, &sid); seq = nlh->nlmsg_seq; data = nlmsg_data(nlh); @@ -691,14 +674,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; status_get = (struct audit_status *)data; if (status_get->mask & AUDIT_STATUS_ENABLED) { - err = audit_set_enabled(status_get->enabled, - loginuid, sessionid, sid); + err = audit_set_enabled(status_get->enabled); if (err < 0) return err; } if (status_get->mask & AUDIT_STATUS_FAILURE) { - err = audit_set_failure(status_get->failure, - loginuid, sessionid, sid); + err = audit_set_failure(status_get->failure); if (err < 0) return err; } @@ -706,22 +687,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) int new_pid = status_get->pid; if (audit_enabled != AUDIT_OFF) - audit_log_config_change("audit_pid", new_pid, - audit_pid, loginuid, - sessionid, sid, 1); - + audit_log_config_change("audit_pid", new_pid, audit_pid, 1); audit_pid = new_pid; audit_nlk_portid = NETLINK_CB(skb).portid; } if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { - err = audit_set_rate_limit(status_get->rate_limit, - loginuid, sessionid, sid); + err = audit_set_rate_limit(status_get->rate_limit); if (err < 0) return err; } if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) - err = audit_set_backlog_limit(status_get->backlog_limit, - loginuid, sessionid, sid); + err = audit_set_backlog_limit(status_get->backlog_limit); break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: @@ -729,25 +705,22 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (!audit_enabled && msg_type != AUDIT_USER_AVC) return 0; - err = audit_filter_user(); + err = audit_filter_user(msg_type); if (err == 1) { err = 0; if (msg_type == AUDIT_USER_TTY) { - err = tty_audit_push_task(current, loginuid, - sessionid); + err = tty_audit_push_current(); if (err) break; } - audit_log_common_recv_msg(&ab, msg_type, - loginuid, sessionid, sid); - + audit_log_common_recv_msg(&ab, msg_type); if (msg_type != AUDIT_USER_TTY) audit_log_format(ab, " msg='%.1024s'", (char *)data); else { int size; - audit_log_format(ab, " msg="); + audit_log_format(ab, " data="); size = nlmsg_len(nlh); if (size > 0 && ((unsigned char *)data)[size - 1] == '\0') @@ -758,50 +731,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_log_end(ab); } break; - case AUDIT_ADD: - case AUDIT_DEL: - if (nlmsg_len(nlh) < sizeof(struct audit_rule)) - return -EINVAL; - if (audit_enabled == AUDIT_LOCKED) { - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, - loginuid, sessionid, sid); - - audit_log_format(ab, " audit_enabled=%d res=0", - audit_enabled); - audit_log_end(ab); - return -EPERM; - } - /* fallthrough */ - case AUDIT_LIST: - err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, - seq, data, nlmsg_len(nlh), - loginuid, sessionid, sid); - break; case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, - loginuid, sessionid, sid); - - audit_log_format(ab, " audit_enabled=%d res=0", - audit_enabled); + audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); + audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled); audit_log_end(ab); return -EPERM; } /* fallthrough */ case AUDIT_LIST_RULES: err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, - seq, data, nlmsg_len(nlh), - loginuid, sessionid, sid); + seq, data, nlmsg_len(nlh)); break; case AUDIT_TRIM: audit_trim_trees(); - - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, - loginuid, sessionid, sid); - + audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=trim res=1"); audit_log_end(ab); break; @@ -831,8 +778,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) /* OK, here comes... */ err = audit_tag_tree(old, new); - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, - loginuid, sessionid, sid); + audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=make_equiv old="); audit_log_untrustedstring(ab, old); @@ -871,27 +817,30 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct audit_tty_status s; struct task_struct *tsk = current; - spin_lock_irq(&tsk->sighand->siglock); + spin_lock(&tsk->sighand->siglock); s.enabled = tsk->signal->audit_tty != 0; - spin_unlock_irq(&tsk->sighand->siglock); + s.log_passwd = tsk->signal->audit_tty_log_passwd; + spin_unlock(&tsk->sighand->siglock); audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_TTY_SET: { - struct audit_tty_status *s; + struct audit_tty_status s; struct task_struct *tsk = current; - if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) - return -EINVAL; - s = data; - if (s->enabled != 0 && s->enabled != 1) + memset(&s, 0, sizeof(s)); + /* guard against past and future API changes */ + memcpy(&s, data, min(sizeof(s), (size_t)nlh->nlmsg_len)); + if ((s.enabled != 0 && s.enabled != 1) || + (s.log_passwd != 0 && s.log_passwd != 1)) return -EINVAL; - spin_lock_irq(&tsk->sighand->siglock); - tsk->signal->audit_tty = s->enabled != 0; - spin_unlock_irq(&tsk->sighand->siglock); + spin_lock(&tsk->sighand->siglock); + tsk->signal->audit_tty = s.enabled; + tsk->signal->audit_tty_log_passwd = s.log_passwd; + spin_unlock(&tsk->sighand->siglock); break; } default: @@ -1434,6 +1383,14 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, kfree(pathname); } +void audit_log_session_info(struct audit_buffer *ab) +{ + u32 sessionid = audit_get_sessionid(current); + uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); + + audit_log_format(ab, " auid=%u ses=%u\n", auid, sessionid); +} + void audit_log_key(struct audit_buffer *ab, char *key) { audit_log_format(ab, " key="); @@ -1443,6 +1400,224 @@ void audit_log_key(struct audit_buffer *ab, char *key) audit_log_format(ab, "(null)"); } +void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) +{ + int i; + + audit_log_format(ab, " %s=", prefix); + CAP_FOR_EACH_U32(i) { + audit_log_format(ab, "%08x", + cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]); + } +} + +void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) +{ + kernel_cap_t *perm = &name->fcap.permitted; + kernel_cap_t *inh = &name->fcap.inheritable; + int log = 0; + + if (!cap_isclear(*perm)) { + audit_log_cap(ab, "cap_fp", perm); + log = 1; + } + if (!cap_isclear(*inh)) { + audit_log_cap(ab, "cap_fi", inh); + log = 1; + } + + if (log) + audit_log_format(ab, " cap_fe=%d cap_fver=%x", + name->fcap.fE, name->fcap_ver); +} + +static inline int audit_copy_fcaps(struct audit_names *name, + const struct dentry *dentry) +{ + struct cpu_vfs_cap_data caps; + int rc; + + if (!dentry) + return 0; + + rc = get_vfs_caps_from_disk(dentry, &caps); + if (rc) + return rc; + + name->fcap.permitted = caps.permitted; + name->fcap.inheritable = caps.inheritable; + name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); + name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> + VFS_CAP_REVISION_SHIFT; + + return 0; +} + +/* Copy inode data into an audit_names. */ +void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, + const struct inode *inode) +{ + name->ino = inode->i_ino; + name->dev = inode->i_sb->s_dev; + name->mode = inode->i_mode; + name->uid = inode->i_uid; + name->gid = inode->i_gid; + name->rdev = inode->i_rdev; + security_inode_getsecid(inode, &name->osid); + audit_copy_fcaps(name, dentry); +} + +/** + * audit_log_name - produce AUDIT_PATH record from struct audit_names + * @context: audit_context for the task + * @n: audit_names structure with reportable details + * @path: optional path to report instead of audit_names->name + * @record_num: record number to report when handling a list of names + * @call_panic: optional pointer to int that will be updated if secid fails + */ +void audit_log_name(struct audit_context *context, struct audit_names *n, + struct path *path, int record_num, int *call_panic) +{ + struct audit_buffer *ab; + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); + if (!ab) + return; + + audit_log_format(ab, "item=%d", record_num); + + if (path) + audit_log_d_path(ab, " name=", path); + else if (n->name) { + switch (n->name_len) { + case AUDIT_NAME_FULL: + /* log the full path */ + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, n->name->name); + break; + case 0: + /* name was specified as a relative path and the + * directory component is the cwd */ + audit_log_d_path(ab, " name=", &context->pwd); + break; + default: + /* log the name's directory component */ + audit_log_format(ab, " name="); + audit_log_n_untrustedstring(ab, n->name->name, + n->name_len); + } + } else + audit_log_format(ab, " name=(null)"); + + if (n->ino != (unsigned long)-1) { + audit_log_format(ab, " inode=%lu" + " dev=%02x:%02x mode=%#ho" + " ouid=%u ogid=%u rdev=%02x:%02x", + n->ino, + MAJOR(n->dev), + MINOR(n->dev), + n->mode, + from_kuid(&init_user_ns, n->uid), + from_kgid(&init_user_ns, n->gid), + MAJOR(n->rdev), + MINOR(n->rdev)); + } + if (n->osid != 0) { + char *ctx = NULL; + u32 len; + if (security_secid_to_secctx( + n->osid, &ctx, &len)) { + audit_log_format(ab, " osid=%u", n->osid); + if (call_panic) + *call_panic = 2; + } else { + audit_log_format(ab, " obj=%s", ctx); + security_release_secctx(ctx, len); + } + } + + audit_log_fcaps(ab, n); + audit_log_end(ab); +} + +int audit_log_task_context(struct audit_buffer *ab) +{ + char *ctx = NULL; + unsigned len; + int error; + u32 sid; + + security_task_getsecid(current, &sid); + if (!sid) + return 0; + + error = security_secid_to_secctx(sid, &ctx, &len); + if (error) { + if (error != -EINVAL) + goto error_path; + return 0; + } + + audit_log_format(ab, " subj=%s", ctx); + security_release_secctx(ctx, len); + return 0; + +error_path: + audit_panic("error in audit_log_task_context"); + return error; +} +EXPORT_SYMBOL(audit_log_task_context); + +void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) +{ + const struct cred *cred; + char name[sizeof(tsk->comm)]; + struct mm_struct *mm = tsk->mm; + char *tty; + + if (!ab) + return; + + /* tsk == current */ + cred = current_cred(); + + spin_lock_irq(&tsk->sighand->siglock); + if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) + tty = tsk->signal->tty->name; + else + tty = "(none)"; + spin_unlock_irq(&tsk->sighand->siglock); + + audit_log_format(ab, + " ppid=%ld pid=%d auid=%u uid=%u gid=%u" + " euid=%u suid=%u fsuid=%u" + " egid=%u sgid=%u fsgid=%u ses=%u tty=%s", + sys_getppid(), + tsk->pid, + from_kuid(&init_user_ns, audit_get_loginuid(tsk)), + from_kuid(&init_user_ns, cred->uid), + from_kgid(&init_user_ns, cred->gid), + from_kuid(&init_user_ns, cred->euid), + from_kuid(&init_user_ns, cred->suid), + from_kuid(&init_user_ns, cred->fsuid), + from_kgid(&init_user_ns, cred->egid), + from_kgid(&init_user_ns, cred->sgid), + from_kgid(&init_user_ns, cred->fsgid), + audit_get_sessionid(tsk), tty); + + get_task_comm(name, tsk); + audit_log_format(ab, " comm="); + audit_log_untrustedstring(ab, name); + + if (mm) { + down_read(&mm->mmap_sem); + if (mm->exe_file) + audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); + up_read(&mm->mmap_sem); + } + audit_log_task_context(ab); +} +EXPORT_SYMBOL(audit_log_task_info); + /** * audit_log_link_denied - report a link restriction denial * @operation: specific link opreation @@ -1451,19 +1626,28 @@ void audit_log_key(struct audit_buffer *ab, char *key) void audit_log_link_denied(const char *operation, struct path *link) { struct audit_buffer *ab; + struct audit_names *name; + + name = kzalloc(sizeof(*name), GFP_NOFS); + if (!name) + return; + /* Generate AUDIT_ANOM_LINK with subject, operation, outcome. */ ab = audit_log_start(current->audit_context, GFP_KERNEL, AUDIT_ANOM_LINK); if (!ab) - return; - audit_log_format(ab, "op=%s action=denied", operation); - audit_log_format(ab, " pid=%d comm=", current->pid); - audit_log_untrustedstring(ab, current->comm); - audit_log_d_path(ab, " path=", link); - audit_log_format(ab, " dev="); - audit_log_untrustedstring(ab, link->dentry->d_inode->i_sb->s_id); - audit_log_format(ab, " ino=%lu", link->dentry->d_inode->i_ino); + goto out; + audit_log_format(ab, "op=%s", operation); + audit_log_task_info(ab, current); + audit_log_format(ab, " res=0"); audit_log_end(ab); + + /* Generate AUDIT_PATH record with object. */ + name->type = AUDIT_TYPE_NORMAL; + audit_copy_inode(name, link->dentry, link->dentry->d_inode); + audit_log_name(current->audit_context, name, link, 0, NULL); +out: + kfree(name); } /** diff --git a/kernel/audit.h b/kernel/audit.h index 11468d99dad0..1c95131ef760 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -22,6 +22,7 @@ #include <linux/fs.h> #include <linux/audit.h> #include <linux/skbuff.h> +#include <uapi/linux/mqueue.h> /* 0 = no checking 1 = put_count checking @@ -29,6 +30,11 @@ */ #define AUDIT_DEBUG 0 +/* AUDIT_NAMES is the number of slots we reserve in the audit_context + * for saving names from getname(). If we get more names we will allocate + * a name dynamically and also add those to the list anchored by names_list. */ +#define AUDIT_NAMES 5 + /* At task start time, the audit_state is set in the audit_context using a per-task filter. At syscall entry, the audit_state is augmented by the syscall filter. */ @@ -59,8 +65,158 @@ struct audit_entry { struct audit_krule rule; }; +struct audit_cap_data { + kernel_cap_t permitted; + kernel_cap_t inheritable; + union { + unsigned int fE; /* effective bit of file cap */ + kernel_cap_t effective; /* effective set of process */ + }; +}; + +/* When fs/namei.c:getname() is called, we store the pointer in name and + * we don't let putname() free it (instead we free all of the saved + * pointers at syscall exit time). + * + * Further, in fs/namei.c:path_lookup() we store the inode and device. + */ +struct audit_names { + struct list_head list; /* audit_context->names_list */ + + struct filename *name; + int name_len; /* number of chars to log */ + bool name_put; /* call __putname()? */ + + unsigned long ino; + dev_t dev; + umode_t mode; + kuid_t uid; + kgid_t gid; + dev_t rdev; + u32 osid; + struct audit_cap_data fcap; + unsigned int fcap_ver; + unsigned char type; /* record type */ + /* + * This was an allocated audit_names and not from the array of + * names allocated in the task audit context. Thus this name + * should be freed on syscall exit. + */ + bool should_free; +}; + +/* The per-task audit context. */ +struct audit_context { + int dummy; /* must be the first element */ + int in_syscall; /* 1 if task is in a syscall */ + enum audit_state state, current_state; + unsigned int serial; /* serial number for record */ + int major; /* syscall number */ + struct timespec ctime; /* time of syscall entry */ + unsigned long argv[4]; /* syscall arguments */ + long return_code;/* syscall return code */ + u64 prio; + int return_valid; /* return code is valid */ + /* + * The names_list is the list of all audit_names collected during this + * syscall. The first AUDIT_NAMES entries in the names_list will + * actually be from the preallocated_names array for performance + * reasons. Except during allocation they should never be referenced + * through the preallocated_names array and should only be found/used + * by running the names_list. + */ + struct audit_names preallocated_names[AUDIT_NAMES]; + int name_count; /* total records in names_list */ + struct list_head names_list; /* struct audit_names->list anchor */ + char *filterkey; /* key for rule that triggered record */ + struct path pwd; + struct audit_aux_data *aux; + struct audit_aux_data *aux_pids; + struct sockaddr_storage *sockaddr; + size_t sockaddr_len; + /* Save things to print about task_struct */ + pid_t pid, ppid; + kuid_t uid, euid, suid, fsuid; + kgid_t gid, egid, sgid, fsgid; + unsigned long personality; + int arch; + + pid_t target_pid; + kuid_t target_auid; + kuid_t target_uid; + unsigned int target_sessionid; + u32 target_sid; + char target_comm[TASK_COMM_LEN]; + + struct audit_tree_refs *trees, *first_trees; + struct list_head killed_trees; + int tree_count; + + int type; + union { + struct { + int nargs; + long args[6]; + } socketcall; + struct { + kuid_t uid; + kgid_t gid; + umode_t mode; + u32 osid; + int has_perm; + uid_t perm_uid; + gid_t perm_gid; + umode_t perm_mode; + unsigned long qbytes; + } ipc; + struct { + mqd_t mqdes; + struct mq_attr mqstat; + } mq_getsetattr; + struct { + mqd_t mqdes; + int sigev_signo; + } mq_notify; + struct { + mqd_t mqdes; + size_t msg_len; + unsigned int msg_prio; + struct timespec abs_timeout; + } mq_sendrecv; + struct { + int oflag; + umode_t mode; + struct mq_attr attr; + } mq_open; + struct { + pid_t pid; + struct audit_cap_data cap; + } capset; + struct { + int fd; + int flags; + } mmap; + }; + int fds[2]; + +#if AUDIT_DEBUG + int put_count; + int ino_count; +#endif +}; + extern int audit_ever_enabled; +extern void audit_copy_inode(struct audit_names *name, + const struct dentry *dentry, + const struct inode *inode); +extern void audit_log_cap(struct audit_buffer *ab, char *prefix, + kernel_cap_t *cap); +extern void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name); +extern void audit_log_name(struct audit_context *context, + struct audit_names *n, struct path *path, + int record_num, int *call_panic); + extern int audit_pid; #define AUDIT_INODE_BUCKETS 32 diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 267436826c3b..83a2970295d1 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -310,121 +310,83 @@ static u32 audit_to_op(u32 op) return n; } - -/* Translate struct audit_rule to kernel's rule respresentation. - * Exists for backward compatibility with userspace. */ -static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) +/* check if an audit field is valid */ +static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) { - struct audit_entry *entry; - int err = 0; - int i; - - entry = audit_to_entry_common(rule); - if (IS_ERR(entry)) - goto exit_nofree; - - for (i = 0; i < rule->field_count; i++) { - struct audit_field *f = &entry->rule.fields[i]; - u32 n; - - n = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS); - - /* Support for legacy operators where - * AUDIT_NEGATE bit signifies != and otherwise assumes == */ - if (n & AUDIT_NEGATE) - f->op = Audit_not_equal; - else if (!n) - f->op = Audit_equal; - else - f->op = audit_to_op(n); - - entry->rule.vers_ops = (n & AUDIT_OPERATORS) ? 2 : 1; - - f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); - f->val = rule->values[i]; - f->uid = INVALID_UID; - f->gid = INVALID_GID; - - err = -EINVAL; - if (f->op == Audit_bad) - goto exit_free; - - switch(f->type) { - default: - goto exit_free; - case AUDIT_UID: - case AUDIT_EUID: - case AUDIT_SUID: - case AUDIT_FSUID: - case AUDIT_LOGINUID: - /* bit ops not implemented for uid comparisons */ - if (f->op == Audit_bitmask || f->op == Audit_bittest) - goto exit_free; - - f->uid = make_kuid(current_user_ns(), f->val); - if (!uid_valid(f->uid)) - goto exit_free; - break; - case AUDIT_GID: - case AUDIT_EGID: - case AUDIT_SGID: - case AUDIT_FSGID: - /* bit ops not implemented for gid comparisons */ - if (f->op == Audit_bitmask || f->op == Audit_bittest) - goto exit_free; - - f->gid = make_kgid(current_user_ns(), f->val); - if (!gid_valid(f->gid)) - goto exit_free; - break; - case AUDIT_PID: - case AUDIT_PERS: - case AUDIT_MSGTYPE: - case AUDIT_PPID: - case AUDIT_DEVMAJOR: - case AUDIT_DEVMINOR: - case AUDIT_EXIT: - case AUDIT_SUCCESS: - /* bit ops are only useful on syscall args */ - if (f->op == Audit_bitmask || f->op == Audit_bittest) - goto exit_free; - break; - case AUDIT_ARG0: - case AUDIT_ARG1: - case AUDIT_ARG2: - case AUDIT_ARG3: - break; - /* arch is only allowed to be = or != */ - case AUDIT_ARCH: - if (f->op != Audit_not_equal && f->op != Audit_equal) - goto exit_free; - entry->rule.arch_f = f; - break; - case AUDIT_PERM: - if (f->val & ~15) - goto exit_free; - break; - case AUDIT_FILETYPE: - if (f->val & ~S_IFMT) - goto exit_free; - break; - case AUDIT_INODE: - err = audit_to_inode(&entry->rule, f); - if (err) - goto exit_free; - break; - } - } - - if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal) - entry->rule.inode_f = NULL; - -exit_nofree: - return entry; + switch(f->type) { + case AUDIT_MSGTYPE: + if (entry->rule.listnr != AUDIT_FILTER_TYPE && + entry->rule.listnr != AUDIT_FILTER_USER) + return -EINVAL; + break; + }; -exit_free: - audit_free_rule(entry); - return ERR_PTR(err); + switch(f->type) { + default: + return -EINVAL; + case AUDIT_UID: + case AUDIT_EUID: + case AUDIT_SUID: + case AUDIT_FSUID: + case AUDIT_LOGINUID: + case AUDIT_OBJ_UID: + case AUDIT_GID: + case AUDIT_EGID: + case AUDIT_SGID: + case AUDIT_FSGID: + case AUDIT_OBJ_GID: + case AUDIT_PID: + case AUDIT_PERS: + case AUDIT_MSGTYPE: + case AUDIT_PPID: + case AUDIT_DEVMAJOR: + case AUDIT_DEVMINOR: + case AUDIT_EXIT: + case AUDIT_SUCCESS: + /* bit ops are only useful on syscall args */ + if (f->op == Audit_bitmask || f->op == Audit_bittest) + return -EINVAL; + break; + case AUDIT_ARG0: + case AUDIT_ARG1: + case AUDIT_ARG2: + case AUDIT_ARG3: + case AUDIT_SUBJ_USER: + case AUDIT_SUBJ_ROLE: + case AUDIT_SUBJ_TYPE: + case AUDIT_SUBJ_SEN: + case AUDIT_SUBJ_CLR: + case AUDIT_OBJ_USER: + case AUDIT_OBJ_ROLE: + case AUDIT_OBJ_TYPE: + case AUDIT_OBJ_LEV_LOW: + case AUDIT_OBJ_LEV_HIGH: + case AUDIT_WATCH: + case AUDIT_DIR: + case AUDIT_FILTERKEY: + break; + case AUDIT_LOGINUID_SET: + if ((f->val != 0) && (f->val != 1)) + return -EINVAL; + /* FALL THROUGH */ + case AUDIT_ARCH: + if (f->op != Audit_not_equal && f->op != Audit_equal) + return -EINVAL; + break; + case AUDIT_PERM: + if (f->val & ~15) + return -EINVAL; + break; + case AUDIT_FILETYPE: + if (f->val & ~S_IFMT) + return -EINVAL; + break; + case AUDIT_FIELD_COMPARE: + if (f->val > AUDIT_MAX_FIELD_COMPARE) + return -EINVAL; + break; + }; + return 0; } /* Translate struct audit_rule_data to kernel's rule respresentation. */ @@ -459,17 +421,25 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->gid = INVALID_GID; f->lsm_str = NULL; f->lsm_rule = NULL; - switch(f->type) { + + /* Support legacy tests for a valid loginuid */ + if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295)) { + f->type = AUDIT_LOGINUID_SET; + f->val = 0; + } + + err = audit_field_valid(entry, f); + if (err) + goto exit_free; + + err = -EINVAL; + switch (f->type) { + case AUDIT_LOGINUID: case AUDIT_UID: case AUDIT_EUID: case AUDIT_SUID: case AUDIT_FSUID: - case AUDIT_LOGINUID: case AUDIT_OBJ_UID: - /* bit ops not implemented for uid comparisons */ - if (f->op == Audit_bitmask || f->op == Audit_bittest) - goto exit_free; - f->uid = make_kuid(current_user_ns(), f->val); if (!uid_valid(f->uid)) goto exit_free; @@ -479,27 +449,10 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_SGID: case AUDIT_FSGID: case AUDIT_OBJ_GID: - /* bit ops not implemented for gid comparisons */ - if (f->op == Audit_bitmask || f->op == Audit_bittest) - goto exit_free; - f->gid = make_kgid(current_user_ns(), f->val); if (!gid_valid(f->gid)) goto exit_free; break; - case AUDIT_PID: - case AUDIT_PERS: - case AUDIT_MSGTYPE: - case AUDIT_PPID: - case AUDIT_DEVMAJOR: - case AUDIT_DEVMINOR: - case AUDIT_EXIT: - case AUDIT_SUCCESS: - case AUDIT_ARG0: - case AUDIT_ARG1: - case AUDIT_ARG2: - case AUDIT_ARG3: - break; case AUDIT_ARCH: entry->rule.arch_f = f; break; @@ -570,20 +523,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, entry->rule.buflen += f->val; entry->rule.filterkey = str; break; - case AUDIT_PERM: - if (f->val & ~15) - goto exit_free; - break; - case AUDIT_FILETYPE: - if (f->val & ~S_IFMT) - goto exit_free; - break; - case AUDIT_FIELD_COMPARE: - if (f->val > AUDIT_MAX_FIELD_COMPARE) - goto exit_free; - break; - default: - goto exit_free; } } @@ -613,36 +552,6 @@ static inline size_t audit_pack_string(void **bufp, const char *str) return len; } -/* Translate kernel rule respresentation to struct audit_rule. - * Exists for backward compatibility with userspace. */ -static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule) -{ - struct audit_rule *rule; - int i; - - rule = kzalloc(sizeof(*rule), GFP_KERNEL); - if (unlikely(!rule)) - return NULL; - - rule->flags = krule->flags | krule->listnr; - rule->action = krule->action; - rule->field_count = krule->field_count; - for (i = 0; i < rule->field_count; i++) { - rule->values[i] = krule->fields[i].val; - rule->fields[i] = krule->fields[i].type; - - if (krule->vers_ops == 1) { - if (krule->fields[i].op == Audit_not_equal) - rule->fields[i] |= AUDIT_NEGATE; - } else { - rule->fields[i] |= audit_ops[krule->fields[i].op]; - } - } - for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i]; - - return rule; -} - /* Translate kernel rule respresentation to struct audit_rule_data. */ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) { @@ -1055,35 +964,6 @@ out: return ret; } -/* List rules using struct audit_rule. Exists for backward - * compatibility with userspace. */ -static void audit_list(int pid, int seq, struct sk_buff_head *q) -{ - struct sk_buff *skb; - struct audit_krule *r; - int i; - - /* This is a blocking read, so use audit_filter_mutex instead of rcu - * iterator to sync with list writers. */ - for (i=0; i<AUDIT_NR_FILTERS; i++) { - list_for_each_entry(r, &audit_rules_list[i], list) { - struct audit_rule *rule; - - rule = audit_krule_to_rule(r); - if (unlikely(!rule)) - break; - skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, - rule, sizeof(*rule)); - if (skb) - skb_queue_tail(q, skb); - kfree(rule); - } - } - skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); - if (skb) - skb_queue_tail(q, skb); -} - /* List rules using struct audit_rule_data. */ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) { @@ -1113,11 +993,11 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) } /* Log rule additions and removals */ -static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid, - char *action, struct audit_krule *rule, - int res) +static void audit_log_rule_change(char *action, struct audit_krule *rule, int res) { struct audit_buffer *ab; + uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current)); + u32 sessionid = audit_get_sessionid(current); if (!audit_enabled) return; @@ -1125,18 +1005,8 @@ static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid, ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (!ab) return; - audit_log_format(ab, "auid=%u ses=%u", - from_kuid(&init_user_ns, loginuid), sessionid); - if (sid) { - char *ctx = NULL; - u32 len; - if (security_secid_to_secctx(sid, &ctx, &len)) - audit_log_format(ab, " ssid=%u", sid); - else { - audit_log_format(ab, " subj=%s", ctx); - security_release_secctx(ctx, len); - } - } + audit_log_format(ab, "auid=%u ses=%u" ,loginuid, sessionid); + audit_log_task_context(ab); audit_log_format(ab, " op="); audit_log_string(ab, action); audit_log_key(ab, rule->filterkey); @@ -1155,8 +1025,7 @@ static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid, * @sessionid: sessionid for netlink audit message * @sid: SE Linux Security ID of sender */ -int audit_receive_filter(int type, int pid, int seq, void *data, - size_t datasz, kuid_t loginuid, u32 sessionid, u32 sid) +int audit_receive_filter(int type, int pid, int seq, void *data, size_t datasz) { struct task_struct *tsk; struct audit_netlink_list *dest; @@ -1164,7 +1033,6 @@ int audit_receive_filter(int type, int pid, int seq, void *data, struct audit_entry *entry; switch (type) { - case AUDIT_LIST: case AUDIT_LIST_RULES: /* We can't just spew out the rules here because we might fill * the available socket buffer space and deadlock waiting for @@ -1179,10 +1047,7 @@ int audit_receive_filter(int type, int pid, int seq, void *data, skb_queue_head_init(&dest->q); mutex_lock(&audit_filter_mutex); - if (type == AUDIT_LIST) - audit_list(pid, seq, &dest->q); - else - audit_list_rules(pid, seq, &dest->q); + audit_list_rules(pid, seq, &dest->q); mutex_unlock(&audit_filter_mutex); tsk = kthread_run(audit_send_list, dest, "audit_send_list"); @@ -1192,35 +1057,23 @@ int audit_receive_filter(int type, int pid, int seq, void *data, err = PTR_ERR(tsk); } break; - case AUDIT_ADD: case AUDIT_ADD_RULE: - if (type == AUDIT_ADD) - entry = audit_rule_to_entry(data); - else - entry = audit_data_to_entry(data, datasz); + entry = audit_data_to_entry(data, datasz); if (IS_ERR(entry)) return PTR_ERR(entry); err = audit_add_rule(entry); - audit_log_rule_change(loginuid, sessionid, sid, "add rule", - &entry->rule, !err); - + audit_log_rule_change("add rule", &entry->rule, !err); if (err) audit_free_rule(entry); break; - case AUDIT_DEL: case AUDIT_DEL_RULE: - if (type == AUDIT_DEL) - entry = audit_rule_to_entry(data); - else - entry = audit_data_to_entry(data, datasz); + entry = audit_data_to_entry(data, datasz); if (IS_ERR(entry)) return PTR_ERR(entry); err = audit_del_rule(entry); - audit_log_rule_change(loginuid, sessionid, sid, "remove rule", - &entry->rule, !err); - + audit_log_rule_change("remove rule", &entry->rule, !err); audit_free_rule(entry); break; default: @@ -1358,7 +1211,7 @@ int audit_compare_dname_path(const char *dname, const char *path, int parentlen) return strncmp(p, dname, dlen); } -static int audit_filter_user_rules(struct audit_krule *rule, +static int audit_filter_user_rules(struct audit_krule *rule, int type, enum audit_state *state) { int i; @@ -1382,6 +1235,13 @@ static int audit_filter_user_rules(struct audit_krule *rule, result = audit_uid_comparator(audit_get_loginuid(current), f->op, f->uid); break; + case AUDIT_LOGINUID_SET: + result = audit_comparator(audit_loginuid_set(current), + f->op, f->val); + break; + case AUDIT_MSGTYPE: + result = audit_comparator(type, f->op, f->val); + break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: @@ -1408,7 +1268,7 @@ static int audit_filter_user_rules(struct audit_krule *rule, return 1; } -int audit_filter_user(void) +int audit_filter_user(int type) { enum audit_state state = AUDIT_DISABLED; struct audit_entry *e; @@ -1416,7 +1276,7 @@ int audit_filter_user(void) rcu_read_lock(); list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { - if (audit_filter_user_rules(&e->rule, &state)) { + if (audit_filter_user_rules(&e->rule, type, &state)) { if (state == AUDIT_DISABLED) ret = 0; break; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index c68229411a7c..3c8a601324a2 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -76,11 +76,6 @@ #define AUDITSC_SUCCESS 1 #define AUDITSC_FAILURE 2 -/* AUDIT_NAMES is the number of slots we reserve in the audit_context - * for saving names from getname(). If we get more names we will allocate - * a name dynamically and also add those to the list anchored by names_list. */ -#define AUDIT_NAMES 5 - /* no execve audit message should be longer than this (userspace limits) */ #define MAX_EXECVE_AUDIT_LEN 7500 @@ -90,44 +85,6 @@ int audit_n_rules; /* determines whether we collect data for signals sent */ int audit_signals; -struct audit_cap_data { - kernel_cap_t permitted; - kernel_cap_t inheritable; - union { - unsigned int fE; /* effective bit of a file capability */ - kernel_cap_t effective; /* effective set of a process */ - }; -}; - -/* When fs/namei.c:getname() is called, we store the pointer in name and - * we don't let putname() free it (instead we free all of the saved - * pointers at syscall exit time). - * - * Further, in fs/namei.c:path_lookup() we store the inode and device. - */ -struct audit_names { - struct list_head list; /* audit_context->names_list */ - struct filename *name; - unsigned long ino; - dev_t dev; - umode_t mode; - kuid_t uid; - kgid_t gid; - dev_t rdev; - u32 osid; - struct audit_cap_data fcap; - unsigned int fcap_ver; - int name_len; /* number of name's characters to log */ - unsigned char type; /* record type */ - bool name_put; /* call __putname() for this name */ - /* - * This was an allocated audit_names and not from the array of - * names allocated in the task audit context. Thus this name - * should be freed on syscall exit - */ - bool should_free; -}; - struct audit_aux_data { struct audit_aux_data *next; int type; @@ -175,106 +132,6 @@ struct audit_tree_refs { struct audit_chunk *c[31]; }; -/* The per-task audit context. */ -struct audit_context { - int dummy; /* must be the first element */ - int in_syscall; /* 1 if task is in a syscall */ - enum audit_state state, current_state; - unsigned int serial; /* serial number for record */ - int major; /* syscall number */ - struct timespec ctime; /* time of syscall entry */ - unsigned long argv[4]; /* syscall arguments */ - long return_code;/* syscall return code */ - u64 prio; - int return_valid; /* return code is valid */ - /* - * The names_list is the list of all audit_names collected during this - * syscall. The first AUDIT_NAMES entries in the names_list will - * actually be from the preallocated_names array for performance - * reasons. Except during allocation they should never be referenced - * through the preallocated_names array and should only be found/used - * by running the names_list. - */ - struct audit_names preallocated_names[AUDIT_NAMES]; - int name_count; /* total records in names_list */ - struct list_head names_list; /* anchor for struct audit_names->list */ - char * filterkey; /* key for rule that triggered record */ - struct path pwd; - struct audit_aux_data *aux; - struct audit_aux_data *aux_pids; - struct sockaddr_storage *sockaddr; - size_t sockaddr_len; - /* Save things to print about task_struct */ - pid_t pid, ppid; - kuid_t uid, euid, suid, fsuid; - kgid_t gid, egid, sgid, fsgid; - unsigned long personality; - int arch; - - pid_t target_pid; - kuid_t target_auid; - kuid_t target_uid; - unsigned int target_sessionid; - u32 target_sid; - char target_comm[TASK_COMM_LEN]; - - struct audit_tree_refs *trees, *first_trees; - struct list_head killed_trees; - int tree_count; - - int type; - union { - struct { - int nargs; - long args[6]; - } socketcall; - struct { - kuid_t uid; - kgid_t gid; - umode_t mode; - u32 osid; - int has_perm; - uid_t perm_uid; - gid_t perm_gid; - umode_t perm_mode; - unsigned long qbytes; - } ipc; - struct { - mqd_t mqdes; - struct mq_attr mqstat; - } mq_getsetattr; - struct { - mqd_t mqdes; - int sigev_signo; - } mq_notify; - struct { - mqd_t mqdes; - size_t msg_len; - unsigned int msg_prio; - struct timespec abs_timeout; - } mq_sendrecv; - struct { - int oflag; - umode_t mode; - struct mq_attr attr; - } mq_open; - struct { - pid_t pid; - struct audit_cap_data cap; - } capset; - struct { - int fd; - int flags; - } mmap; - }; - int fds[2]; - -#if AUDIT_DEBUG - int put_count; - int ino_count; -#endif -}; - static inline int open_arg(int flags, int mask) { int n = ACC_MODE(flags); @@ -633,9 +490,23 @@ static int audit_filter_rules(struct task_struct *tsk, break; case AUDIT_GID: result = audit_gid_comparator(cred->gid, f->op, f->gid); + if (f->op == Audit_equal) { + if (!result) + result = in_group_p(f->gid); + } else if (f->op == Audit_not_equal) { + if (result) + result = !in_group_p(f->gid); + } break; case AUDIT_EGID: result = audit_gid_comparator(cred->egid, f->op, f->gid); + if (f->op == Audit_equal) { + if (!result) + result = in_egroup_p(f->gid); + } else if (f->op == Audit_not_equal) { + if (result) + result = !in_egroup_p(f->gid); + } break; case AUDIT_SGID: result = audit_gid_comparator(cred->sgid, f->op, f->gid); @@ -742,6 +613,9 @@ static int audit_filter_rules(struct task_struct *tsk, if (ctx) result = audit_uid_comparator(tsk->loginuid, f->op, f->uid); break; + case AUDIT_LOGINUID_SET: + result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val); + break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: @@ -987,6 +861,8 @@ static inline void audit_free_names(struct audit_context *context) #if AUDIT_DEBUG == 2 if (context->put_count + context->ino_count != context->name_count) { + int i = 0; + printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d" " name_count=%d put_count=%d" " ino_count=%d [NOT freeing]\n", @@ -995,7 +871,7 @@ static inline void audit_free_names(struct audit_context *context) context->name_count, context->put_count, context->ino_count); list_for_each_entry(n, &context->names_list, list) { - printk(KERN_ERR "names[%d] = %p = %s\n", i, + printk(KERN_ERR "names[%d] = %p = %s\n", i++, n->name, n->name->name ?: "(null)"); } dump_stack(); @@ -1010,7 +886,7 @@ static inline void audit_free_names(struct audit_context *context) list_for_each_entry_safe(n, next, &context->names_list, list) { list_del(&n->list); if (n->name && n->name_put) - __putname(n->name); + final_putname(n->name); if (n->should_free) kfree(n); } @@ -1093,88 +969,6 @@ static inline void audit_free_context(struct audit_context *context) kfree(context); } -void audit_log_task_context(struct audit_buffer *ab) -{ - char *ctx = NULL; - unsigned len; - int error; - u32 sid; - - security_task_getsecid(current, &sid); - if (!sid) - return; - - error = security_secid_to_secctx(sid, &ctx, &len); - if (error) { - if (error != -EINVAL) - goto error_path; - return; - } - - audit_log_format(ab, " subj=%s", ctx); - security_release_secctx(ctx, len); - return; - -error_path: - audit_panic("error in audit_log_task_context"); - return; -} - -EXPORT_SYMBOL(audit_log_task_context); - -void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) -{ - const struct cred *cred; - char name[sizeof(tsk->comm)]; - struct mm_struct *mm = tsk->mm; - char *tty; - - if (!ab) - return; - - /* tsk == current */ - cred = current_cred(); - - spin_lock_irq(&tsk->sighand->siglock); - if (tsk->signal && tsk->signal->tty) - tty = tsk->signal->tty->name; - else - tty = "(none)"; - spin_unlock_irq(&tsk->sighand->siglock); - - - audit_log_format(ab, - " ppid=%ld pid=%d auid=%u uid=%u gid=%u" - " euid=%u suid=%u fsuid=%u" - " egid=%u sgid=%u fsgid=%u ses=%u tty=%s", - sys_getppid(), - tsk->pid, - from_kuid(&init_user_ns, tsk->loginuid), - from_kuid(&init_user_ns, cred->uid), - from_kgid(&init_user_ns, cred->gid), - from_kuid(&init_user_ns, cred->euid), - from_kuid(&init_user_ns, cred->suid), - from_kuid(&init_user_ns, cred->fsuid), - from_kgid(&init_user_ns, cred->egid), - from_kgid(&init_user_ns, cred->sgid), - from_kgid(&init_user_ns, cred->fsgid), - tsk->sessionid, tty); - - get_task_comm(name, tsk); - audit_log_format(ab, " comm="); - audit_log_untrustedstring(ab, name); - - if (mm) { - down_read(&mm->mmap_sem); - if (mm->exe_file) - audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); - up_read(&mm->mmap_sem); - } - audit_log_task_context(ab); -} - -EXPORT_SYMBOL(audit_log_task_info); - static int audit_log_pid_context(struct audit_context *context, pid_t pid, kuid_t auid, kuid_t uid, unsigned int sessionid, u32 sid, char *comm) @@ -1191,12 +985,14 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, from_kuid(&init_user_ns, auid), from_kuid(&init_user_ns, uid), sessionid); - if (security_secid_to_secctx(sid, &ctx, &len)) { - audit_log_format(ab, " obj=(none)"); - rc = 1; - } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); + if (sid) { + if (security_secid_to_secctx(sid, &ctx, &len)) { + audit_log_format(ab, " obj=(none)"); + rc = 1; + } else { + audit_log_format(ab, " obj=%s", ctx); + security_release_secctx(ctx, len); + } } audit_log_format(ab, " ocomm="); audit_log_untrustedstring(ab, comm); @@ -1390,35 +1186,6 @@ static void audit_log_execve_info(struct audit_context *context, kfree(buf); } -static void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) -{ - int i; - - audit_log_format(ab, " %s=", prefix); - CAP_FOR_EACH_U32(i) { - audit_log_format(ab, "%08x", cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]); - } -} - -static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) -{ - kernel_cap_t *perm = &name->fcap.permitted; - kernel_cap_t *inh = &name->fcap.inheritable; - int log = 0; - - if (!cap_isclear(*perm)) { - audit_log_cap(ab, "cap_fp", perm); - log = 1; - } - if (!cap_isclear(*inh)) { - audit_log_cap(ab, "cap_fi", inh); - log = 1; - } - - if (log) - audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver); -} - static void show_special(struct audit_context *context, int *call_panic) { struct audit_buffer *ab; @@ -1516,68 +1283,6 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_end(ab); } -static void audit_log_name(struct audit_context *context, struct audit_names *n, - int record_num, int *call_panic) -{ - struct audit_buffer *ab; - ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); - if (!ab) - return; /* audit_panic has been called */ - - audit_log_format(ab, "item=%d", record_num); - - if (n->name) { - switch (n->name_len) { - case AUDIT_NAME_FULL: - /* log the full path */ - audit_log_format(ab, " name="); - audit_log_untrustedstring(ab, n->name->name); - break; - case 0: - /* name was specified as a relative path and the - * directory component is the cwd */ - audit_log_d_path(ab, " name=", &context->pwd); - break; - default: - /* log the name's directory component */ - audit_log_format(ab, " name="); - audit_log_n_untrustedstring(ab, n->name->name, - n->name_len); - } - } else - audit_log_format(ab, " name=(null)"); - - if (n->ino != (unsigned long)-1) { - audit_log_format(ab, " inode=%lu" - " dev=%02x:%02x mode=%#ho" - " ouid=%u ogid=%u rdev=%02x:%02x", - n->ino, - MAJOR(n->dev), - MINOR(n->dev), - n->mode, - from_kuid(&init_user_ns, n->uid), - from_kgid(&init_user_ns, n->gid), - MAJOR(n->rdev), - MINOR(n->rdev)); - } - if (n->osid != 0) { - char *ctx = NULL; - u32 len; - if (security_secid_to_secctx( - n->osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", n->osid); - *call_panic = 2; - } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); - } - } - - audit_log_fcaps(ab, n); - - audit_log_end(ab); -} - static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { int i, call_panic = 0; @@ -1695,7 +1400,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts i = 0; list_for_each_entry(n, &context->names_list, list) - audit_log_name(context, n, i++, &call_panic); + audit_log_name(context, n, NULL, i++, &call_panic); /* Send end of event record to help user space know we are finished */ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); @@ -2030,18 +1735,18 @@ void audit_putname(struct filename *name) BUG_ON(!context); if (!context->in_syscall) { #if AUDIT_DEBUG == 2 - printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", + printk(KERN_ERR "%s:%d(:%d): final_putname(%p)\n", __FILE__, __LINE__, context->serial, name); if (context->name_count) { struct audit_names *n; - int i; + int i = 0; list_for_each_entry(n, &context->names_list, list) - printk(KERN_ERR "name[%d] = %p = %s\n", i, + printk(KERN_ERR "name[%d] = %p = %s\n", i++, n->name, n->name->name ?: "(null)"); } #endif - __putname(name); + final_putname(name); } #if AUDIT_DEBUG else { @@ -2060,41 +1765,6 @@ void audit_putname(struct filename *name) #endif } -static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry) -{ - struct cpu_vfs_cap_data caps; - int rc; - - if (!dentry) - return 0; - - rc = get_vfs_caps_from_disk(dentry, &caps); - if (rc) - return rc; - - name->fcap.permitted = caps.permitted; - name->fcap.inheritable = caps.inheritable; - name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); - name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; - - return 0; -} - - -/* Copy inode data into an audit_names. */ -static void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, - const struct inode *inode) -{ - name->ino = inode->i_ino; - name->dev = inode->i_sb->s_dev; - name->mode = inode->i_mode; - name->uid = inode->i_uid; - name->gid = inode->i_gid; - name->rdev = inode->i_rdev; - security_inode_getsecid(inode, &name->osid); - audit_copy_fcaps(name, dentry); -} - /** * __audit_inode - store the inode and device from a lookup * @name: name being audited @@ -2303,7 +1973,7 @@ int audit_set_loginuid(kuid_t loginuid) unsigned int sessionid; #ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE - if (uid_valid(task->loginuid)) + if (audit_loginuid_set(task)) return -EPERM; #else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ if (!capable(CAP_AUDIT_CONTROL)) @@ -2471,17 +2141,20 @@ int __audit_bprm(struct linux_binprm *bprm) /** * audit_socketcall - record audit data for sys_socketcall - * @nargs: number of args + * @nargs: number of args, which should not be more than AUDITSC_ARGS. * @args: args array * */ -void __audit_socketcall(int nargs, unsigned long *args) +int __audit_socketcall(int nargs, unsigned long *args) { struct audit_context *context = current->audit_context; + if (nargs <= 0 || nargs > AUDITSC_ARGS || !args) + return -EINVAL; context->type = AUDIT_SOCKETCALL; context->socketcall.nargs = nargs; memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); + return 0; } /** diff --git a/kernel/events/core.c b/kernel/events/core.c index 3820e3cefbae..6b41c1899a8b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -18,6 +18,7 @@ #include <linux/poll.h> #include <linux/slab.h> #include <linux/hash.h> +#include <linux/tick.h> #include <linux/sysfs.h> #include <linux/dcache.h> #include <linux/percpu.h> @@ -685,8 +686,12 @@ static void perf_pmu_rotate_start(struct pmu *pmu) WARN_ON(!irqs_disabled()); - if (list_empty(&cpuctx->rotation_list)) + if (list_empty(&cpuctx->rotation_list)) { + int was_empty = list_empty(head); list_add(&cpuctx->rotation_list, head); + if (was_empty) + tick_nohz_full_kick(); + } } static void get_ctx(struct perf_event_context *ctx) @@ -2591,6 +2596,16 @@ done: list_del_init(&cpuctx->rotation_list); } +#ifdef CONFIG_NO_HZ_FULL +bool perf_event_can_stop_tick(void) +{ + if (list_empty(&__get_cpu_var(rotation_list))) + return true; + else + return false; +} +#endif + void perf_event_task_tick(void) { struct list_head *head = &__get_cpu_var(rotation_list); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 97fddb09762b..cd55144270b5 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -326,11 +326,16 @@ void rb_free(struct ring_buffer *rb) } #else +static int data_page_nr(struct ring_buffer *rb) +{ + return rb->nr_pages << page_order(rb); +} struct page * perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) { - if (pgoff > (1UL << page_order(rb))) + /* The '>' counts in the user page. */ + if (pgoff > data_page_nr(rb)) return NULL; return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE); @@ -350,10 +355,11 @@ static void rb_free_work(struct work_struct *work) int i, nr; rb = container_of(work, struct ring_buffer, work); - nr = 1 << page_order(rb); + nr = data_page_nr(rb); base = rb->user_page; - for (i = 0; i < nr + 1; i++) + /* The '<=' counts in the user page. */ + for (i = 0; i <= nr; i++) perf_mmap_unmark_page(base + (i * PAGE_SIZE)); vfree(base); @@ -387,7 +393,7 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) rb->user_page = all_buf; rb->data_pages[0] = all_buf + PAGE_SIZE; rb->page_order = ilog2(nr_pages); - rb->nr_pages = 1; + rb->nr_pages = !!nr_pages; ring_buffer_init(rb, watermark, flags); diff --git a/kernel/fork.c b/kernel/fork.c index 7d40687b1434..987b28a1f01b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -70,6 +70,7 @@ #include <linux/khugepaged.h> #include <linux/signalfd.h> #include <linux/uprobes.h> +#include <linux/aio.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -1303,6 +1304,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->memcg_batch.do_batch = 0; p->memcg_batch.memcg = NULL; #endif +#ifdef CONFIG_BCACHE + p->sequential_io = 0; + p->sequential_io_avg = 0; +#endif /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 609d8ff38b74..fd4b13b131f8 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -172,7 +172,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, */ static int hrtimer_get_target(int this_cpu, int pinned) { -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) return get_nohz_timer_target(); #endif @@ -1125,7 +1125,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) } EXPORT_SYMBOL_GPL(hrtimer_get_remaining); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /** * hrtimer_get_next_event - get the time until next expiry event * diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 96f3a1d9c379..5a83dde8ca0c 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -462,9 +462,23 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, if (domain->ops->map) { ret = domain->ops->map(domain, virq, hwirq); if (ret != 0) { - pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n", - virq, hwirq, ret); - WARN_ON(1); + /* + * If map() returns -EPERM, this interrupt is protected + * by the firmware or some other service and shall not + * be mapped. + * + * Since on some platforms we blindly try to map everything + * we end up with a log full of backtraces. + * + * So instead, we silently fail on -EPERM, it is the + * responsibility of the PIC driver to display a relevant + * message if needed. + */ + if (ret != -EPERM) { + pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n", + virq, hwirq, ret); + WARN_ON(1); + } irq_data->domain = NULL; irq_data->hwirq = 0; goto err_unmap; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 2169feeba529..3127ad52cdb2 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -84,9 +84,11 @@ static int is_ksym_addr(unsigned long addr) /* * Expand a compressed symbol data into the resulting uncompressed string, + * if uncompressed string is too long (>= maxlen), it will be truncated, * given the offset to where the symbol is in the compressed stream. */ -static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) +static unsigned int kallsyms_expand_symbol(unsigned int off, + char *result, size_t maxlen) { int len, skipped_first = 0; const u8 *tptr, *data; @@ -113,15 +115,20 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) while (*tptr) { if (skipped_first) { + if (maxlen <= 1) + goto tail; *result = *tptr; result++; + maxlen--; } else skipped_first = 1; tptr++; } } - *result = '\0'; +tail: + if (maxlen) + *result = '\0'; /* Return to offset to the next symbol. */ return off; @@ -176,7 +183,7 @@ unsigned long kallsyms_lookup_name(const char *name) unsigned int off; for (i = 0, off = 0; i < kallsyms_num_syms; i++) { - off = kallsyms_expand_symbol(off, namebuf); + off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); if (strcmp(namebuf, name) == 0) return kallsyms_addresses[i]; @@ -195,7 +202,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, int ret; for (i = 0, off = 0; i < kallsyms_num_syms; i++) { - off = kallsyms_expand_symbol(off, namebuf); + off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); ret = fn(data, namebuf, NULL, kallsyms_addresses[i]); if (ret != 0) return ret; @@ -294,7 +301,8 @@ const char *kallsyms_lookup(unsigned long addr, pos = get_symbol_pos(addr, symbolsize, offset); /* Grab name */ - kallsyms_expand_symbol(get_symbol_offset(pos), namebuf); + kallsyms_expand_symbol(get_symbol_offset(pos), + namebuf, KSYM_NAME_LEN); if (modname) *modname = NULL; return namebuf; @@ -315,7 +323,8 @@ int lookup_symbol_name(unsigned long addr, char *symname) pos = get_symbol_pos(addr, NULL, NULL); /* Grab name */ - kallsyms_expand_symbol(get_symbol_offset(pos), symname); + kallsyms_expand_symbol(get_symbol_offset(pos), + symname, KSYM_NAME_LEN); return 0; } /* See if it's in a module. */ @@ -333,7 +342,8 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, pos = get_symbol_pos(addr, size, offset); /* Grab name */ - kallsyms_expand_symbol(get_symbol_offset(pos), name); + kallsyms_expand_symbol(get_symbol_offset(pos), + name, KSYM_NAME_LEN); modname[0] = '\0'; return 0; } @@ -463,7 +473,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter) iter->type = kallsyms_get_symbol_type(off); - off = kallsyms_expand_symbol(off, iter->name); + off = kallsyms_expand_symbol(off, iter->name, ARRAY_SIZE(iter->name)); return off - iter->nameoff; } diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 6a3bccba7e7d..1f3186b37fd5 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2998,6 +2998,7 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, EXPORT_SYMBOL_GPL(lockdep_init_map); struct lock_class_key __lockdep_no_validate__; +EXPORT_SYMBOL_GPL(__lockdep_no_validate__); static int print_lock_nested_lock_not_held(struct task_struct *curr, diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S index 246b4c6e6135..4a9a86d12c8b 100644 --- a/kernel/modsign_certificate.S +++ b/kernel/modsign_certificate.S @@ -1,15 +1,8 @@ -/* SYMBOL_PREFIX defined on commandline from CONFIG_SYMBOL_PREFIX */ -#ifndef SYMBOL_PREFIX -#define ASM_SYMBOL(sym) sym -#else -#define PASTE2(x,y) x##y -#define PASTE(x,y) PASTE2(x,y) -#define ASM_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym) -#endif +#include <linux/export.h> #define GLOBAL(name) \ - .globl ASM_SYMBOL(name); \ - ASM_SYMBOL(name): + .globl VMLINUX_SYMBOL(name); \ + VMLINUX_SYMBOL(name): .section ".init.data","aw" diff --git a/kernel/module.c b/kernel/module.c index 0925c9a71975..b049939177f6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1209,10 +1209,11 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, /* Since this should be found in kernel (which can't be removed), * no locking is necessary. */ - if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, + if (!find_symbol(VMLINUX_SYMBOL_STR(module_layout), NULL, &crc, true, false)) BUG(); - return check_version(sechdrs, versindex, "module_layout", mod, crc, + return check_version(sechdrs, versindex, + VMLINUX_SYMBOL_STR(module_layout), mod, crc, NULL); } @@ -1861,12 +1862,12 @@ static void free_module(struct module *mod) { trace_module_free(mod); - /* Delete from various lists */ - mutex_lock(&module_mutex); - stop_machine(__unlink_module, mod, NULL); - mutex_unlock(&module_mutex); mod_sysfs_teardown(mod); + /* We leave it in list to prevent duplicate loads, but make sure + * that noone uses it while it's being deconstructed. */ + mod->state = MODULE_STATE_UNFORMED; + /* Remove dynamic debug info */ ddebug_remove_module(mod->name); @@ -1879,6 +1880,11 @@ static void free_module(struct module *mod) /* Free any allocated parameters. */ destroy_params(mod->kp, mod->num_kp); + /* Now we can delete it from the lists */ + mutex_lock(&module_mutex); + stop_machine(__unlink_module, mod, NULL); + mutex_unlock(&module_mutex); + /* This may be NULL, but that's OK */ unset_module_init_ro_nx(mod); module_free(mod, mod->module_init); diff --git a/kernel/params.c b/kernel/params.c index ed35345be536..53b958fcd639 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -613,10 +613,13 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1), GFP_KERNEL); if (!new) { - kfree(mk->mp); + kfree(attrs); err = -ENOMEM; goto fail; } + /* Despite looking like the typical realloc() bug, this is safe. + * We *want* the old 'attrs' to be freed either way, and we'll store + * the new one in the success case. */ attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL); if (!attrs) { err = -ENOMEM; diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 8fd709c9bb58..42670e9b44e0 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -10,6 +10,8 @@ #include <linux/kernel_stat.h> #include <trace/events/timer.h> #include <linux/random.h> +#include <linux/tick.h> +#include <linux/workqueue.h> /* * Called after updating RLIMIT_CPU to run cpu timer and update @@ -153,6 +155,21 @@ static void bump_cpu_timer(struct k_itimer *timer, } } +/** + * task_cputime_zero - Check a task_cputime struct for all zero fields. + * + * @cputime: The struct to compare. + * + * Checks @cputime to see if all fields are zero. Returns true if all fields + * are zero, false if any field is nonzero. + */ +static inline int task_cputime_zero(const struct task_cputime *cputime) +{ + if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) + return 1; + return 0; +} + static inline cputime_t prof_ticks(struct task_struct *p) { cputime_t utime, stime; @@ -636,6 +653,37 @@ static int cpu_timer_sample_group(const clockid_t which_clock, return 0; } +#ifdef CONFIG_NO_HZ_FULL +static void nohz_kick_work_fn(struct work_struct *work) +{ + tick_nohz_full_kick_all(); +} + +static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn); + +/* + * We need the IPIs to be sent from sane process context. + * The posix cpu timers are always set with irqs disabled. + */ +static void posix_cpu_timer_kick_nohz(void) +{ + schedule_work(&nohz_kick_work); +} + +bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) +{ + if (!task_cputime_zero(&tsk->cputime_expires)) + return false; + + if (tsk->signal->cputimer.running) + return false; + + return true; +} +#else +static inline void posix_cpu_timer_kick_nohz(void) { } +#endif + /* * Guts of sys_timer_settime for CPU timers. * This is called with the timer locked and interrupts disabled. @@ -794,6 +842,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, sample_to_timespec(timer->it_clock, old_incr, &old->it_interval); } + if (!ret) + posix_cpu_timer_kick_nohz(); return ret; } @@ -1008,21 +1058,6 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, } } -/** - * task_cputime_zero - Check a task_cputime struct for all zero fields. - * - * @cputime: The struct to compare. - * - * Checks @cputime to see if all fields are zero. Returns true if all fields - * are zero, false if any field is nonzero. - */ -static inline int task_cputime_zero(const struct task_cputime *cputime) -{ - if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) - return 1; - return 0; -} - /* * Check for any per-thread CPU timers that have fired and move them * off the tsk->*_timers list onto the firing list. Per-thread timers @@ -1336,6 +1371,13 @@ void run_posix_cpu_timers(struct task_struct *tsk) cpu_timer_fire(timer); spin_unlock(&timer->it_lock); } + + /* + * In case some timers were rescheduled after the queue got emptied, + * wake up full dynticks CPUs. + */ + if (tsk->signal->cputimer.running) + posix_cpu_timer_kick_nohz(); } /* @@ -1366,7 +1408,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, } if (!*newval) - return; + goto out; *newval += now.cpu; } @@ -1384,6 +1426,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, tsk->signal->cputime_expires.virt_exp = *newval; break; } +out: + posix_cpu_timer_kick_nohz(); } static int do_cpu_nanosleep(const clockid_t which_clock, int flags, diff --git a/kernel/printk.c b/kernel/printk.c index 96dcfcd9a2d4..fa36e1494420 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -32,6 +32,7 @@ #include <linux/security.h> #include <linux/bootmem.h> #include <linux/memblock.h> +#include <linux/aio.h> #include <linux/syscalls.h> #include <linux/kexec.h> #include <linux/kdb.h> diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 17ae54da0ec2..aed981a3f69c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -17,6 +17,7 @@ #include <linux/ptrace.h> #include <linux/security.h> #include <linux/signal.h> +#include <linux/uio.h> #include <linux/audit.h> #include <linux/pid_namespace.h> #include <linux/syscalls.h> diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d8534308fd05..16ea67925015 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -799,6 +799,16 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) rdp->offline_fqs++; return 1; } + + /* + * There is a possibility that a CPU in adaptive-ticks state + * might run in the kernel with the scheduling-clock tick disabled + * for an extended time period. Invoke rcu_kick_nohz_cpu() to + * force the CPU to restart the scheduling-clock tick in this + * CPU is in this state. + */ + rcu_kick_nohz_cpu(rdp->cpu); + return 0; } @@ -1820,7 +1830,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { /* No-CBs CPUs do not have orphanable callbacks. */ - if (is_nocb_cpu(rdp->cpu)) + if (rcu_is_nocb_cpu(rdp->cpu)) return; /* @@ -2892,10 +2902,10 @@ static void _rcu_barrier(struct rcu_state *rsp) * corresponding CPU's preceding callbacks have been invoked. */ for_each_possible_cpu(cpu) { - if (!cpu_online(cpu) && !is_nocb_cpu(cpu)) + if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu)) continue; rdp = per_cpu_ptr(rsp->rda, cpu); - if (is_nocb_cpu(cpu)) { + if (rcu_is_nocb_cpu(cpu)) { _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, rsp->n_barrier_done); atomic_inc(&rsp->barrier_cpu_count); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 14ee40795d6f..da77a8f57ff9 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -530,13 +530,13 @@ static int rcu_nocb_needs_gp(struct rcu_state *rsp); static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); static void rcu_init_one_nocb(struct rcu_node *rnp); -static bool is_nocb_cpu(int cpu); static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy); static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); +static void rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index d084ae3f281c..170814dc418f 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -28,6 +28,7 @@ #include <linux/gfp.h> #include <linux/oom.h> #include <linux/smpboot.h> +#include <linux/tick.h> #define RCU_KTHREAD_PRIO 1 @@ -1705,7 +1706,7 @@ static void rcu_prepare_for_idle(int cpu) return; /* If this is a no-CBs CPU, no callbacks, just return. */ - if (is_nocb_cpu(cpu)) + if (rcu_is_nocb_cpu(cpu)) return; /* @@ -1747,7 +1748,7 @@ static void rcu_cleanup_after_idle(int cpu) struct rcu_data *rdp; struct rcu_state *rsp; - if (is_nocb_cpu(cpu)) + if (rcu_is_nocb_cpu(cpu)) return; rcu_try_advance_all_cbs(); for_each_rcu_flavor(rsp) { @@ -2052,7 +2053,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) } /* Is the specified CPU a no-CPUs CPU? */ -static bool is_nocb_cpu(int cpu) +bool rcu_is_nocb_cpu(int cpu) { if (have_rcu_nocb_mask) return cpumask_test_cpu(cpu, rcu_nocb_mask); @@ -2110,7 +2111,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy) { - if (!is_nocb_cpu(rdp->cpu)) + if (!rcu_is_nocb_cpu(rdp->cpu)) return 0; __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); if (__is_kfree_rcu_offset((unsigned long)rhp->func)) @@ -2134,7 +2135,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, long qll = rsp->qlen_lazy; /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ - if (!is_nocb_cpu(smp_processor_id())) + if (!rcu_is_nocb_cpu(smp_processor_id())) return 0; rsp->qlen = 0; rsp->qlen_lazy = 0; @@ -2306,11 +2307,6 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) { } -static bool is_nocb_cpu(int cpu) -{ - return false; -} - static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy) { @@ -2337,3 +2333,20 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) } #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ + +/* + * An adaptive-ticks CPU can potentially execute in kernel mode for an + * arbitrarily long period of time with the scheduling-clock tick turned + * off. RCU will be paying attention to this CPU because it is in the + * kernel, but the CPU cannot be guaranteed to be executing the RCU state + * machine because the scheduling-clock tick has been disabled. Therefore, + * if an adaptive-ticks CPU is failing to respond to the current grace + * period and has not be idle from an RCU perspective, kick it. + */ +static void rcu_kick_nohz_cpu(int cpu) +{ +#ifdef CONFIG_NO_HZ_FULL + if (tick_nohz_full_cpu(cpu)) + smp_send_reschedule(cpu); +#endif /* #ifdef CONFIG_NO_HZ_FULL */ +} diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 49099e81c87b..cf6c17412932 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -95,7 +95,7 @@ static const struct file_operations rcubarrier_fops = { .open = rcubarrier_open, .read = seq_read, .llseek = no_llseek, - .release = seq_release, + .release = single_release, }; #ifdef CONFIG_RCU_BOOST @@ -206,7 +206,7 @@ static const struct file_operations rcuexp_fops = { .open = rcuexp_open, .read = seq_read, .llseek = no_llseek, - .release = seq_release, + .release = single_release, }; #ifdef CONFIG_RCU_BOOST @@ -306,7 +306,7 @@ static const struct file_operations rcuhier_fops = { .open = rcuhier_open, .read = seq_read, .llseek = no_llseek, - .release = seq_release, + .release = single_release, }; static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) @@ -348,7 +348,7 @@ static const struct file_operations rcugp_fops = { .open = rcugp_open, .read = seq_read, .llseek = no_llseek, - .release = seq_release, + .release = single_release, }; static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) diff --git a/kernel/relay.c b/kernel/relay.c index eef0d113b79e..b91488ba2e5a 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -234,7 +234,6 @@ static void relay_destroy_buf(struct rchan_buf *buf) static void relay_remove_buf(struct kref *kref) { struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); - buf->chan->cb->remove_buf_file(buf->dentry); relay_destroy_buf(buf); } @@ -484,6 +483,7 @@ static void relay_close_buf(struct rchan_buf *buf) { buf->finalized = 1; del_timer_sync(&buf->timer); + buf->chan->cb->remove_buf_file(buf->dentry); kref_put(&buf->kref, relay_remove_buf); } diff --git a/kernel/rwsem.c b/kernel/rwsem.c index b3c6c3fcd847..cfff1435bdfb 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c @@ -126,6 +126,15 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) EXPORT_SYMBOL(_down_write_nest_lock); +void down_read_non_owner(struct rw_semaphore *sem) +{ + might_sleep(); + + __down_read(sem); +} + +EXPORT_SYMBOL(down_read_non_owner); + void down_write_nested(struct rw_semaphore *sem, int subclass) { might_sleep(); @@ -136,6 +145,13 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_write_nested); +void up_read_non_owner(struct rw_semaphore *sem) +{ + __up_read(sem); +} + +EXPORT_SYMBOL(up_read_non_owner); + #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5662f58f0b69..58453b8272fd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -544,7 +544,7 @@ void resched_cpu(int cpu) raw_spin_unlock_irqrestore(&rq->lock, flags); } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * In the semi idle case, use the nearest busy cpu for migrating timers * from an idle cpu. This is good for power-savings. @@ -582,7 +582,7 @@ unlock: * account when the CPU goes back to idle and evaluates the timer * wheel for the next timer event. */ -void wake_up_idle_cpu(int cpu) +static void wake_up_idle_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -612,20 +612,56 @@ void wake_up_idle_cpu(int cpu) smp_send_reschedule(cpu); } +static bool wake_up_full_nohz_cpu(int cpu) +{ + if (tick_nohz_full_cpu(cpu)) { + if (cpu != smp_processor_id() || + tick_nohz_tick_stopped()) + smp_send_reschedule(cpu); + return true; + } + + return false; +} + +void wake_up_nohz_cpu(int cpu) +{ + if (!wake_up_full_nohz_cpu(cpu)) + wake_up_idle_cpu(cpu); +} + static inline bool got_nohz_idle_kick(void) { int cpu = smp_processor_id(); return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); } -#else /* CONFIG_NO_HZ */ +#else /* CONFIG_NO_HZ_COMMON */ static inline bool got_nohz_idle_kick(void) { return false; } -#endif /* CONFIG_NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ + +#ifdef CONFIG_NO_HZ_FULL +bool sched_can_stop_tick(void) +{ + struct rq *rq; + + rq = this_rq(); + + /* Make sure rq->nr_running update is visible after the IPI */ + smp_rmb(); + + /* More than one running task need preemption */ + if (rq->nr_running > 1) + return false; + + return true; +} +#endif /* CONFIG_NO_HZ_FULL */ void sched_avg_update(struct rq *rq) { @@ -1357,7 +1393,8 @@ static void sched_ttwu_pending(void) void scheduler_ipi(void) { - if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) + if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick() + && !tick_nohz_full_cpu(smp_processor_id())) return; /* @@ -1374,6 +1411,7 @@ void scheduler_ipi(void) * somewhat pessimize the simple resched case. */ irq_enter(); + tick_nohz_full_check(); sched_ttwu_pending(); /* @@ -1855,6 +1893,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) kprobe_flush_task(prev); put_task_struct(prev); } + + tick_nohz_task_switch(current); } #ifdef CONFIG_SMP @@ -2118,7 +2158,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) return load >> FSHIFT; } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * Handle NO_HZ for the global load-average. * @@ -2344,12 +2384,12 @@ static void calc_global_nohz(void) smp_wmb(); calc_load_idx++; } -#else /* !CONFIG_NO_HZ */ +#else /* !CONFIG_NO_HZ_COMMON */ static inline long calc_load_fold_idle(void) { return 0; } static inline void calc_global_nohz(void) { } -#endif /* CONFIG_NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ /* * calc_load - update the avenrun load estimates 10 ticks after the @@ -2509,7 +2549,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, sched_avg_update(this_rq); } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * There is no sane way to deal with nohz on smp when using jiffies because the * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading @@ -2569,7 +2609,7 @@ void update_cpu_load_nohz(void) } raw_spin_unlock(&this_rq->lock); } -#endif /* CONFIG_NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ /* * Called from scheduler_tick() @@ -2696,7 +2736,34 @@ void scheduler_tick(void) rq->idle_balance = idle_cpu(cpu); trigger_load_balance(rq, cpu); #endif + rq_last_tick_reset(rq); +} + +#ifdef CONFIG_NO_HZ_FULL +/** + * scheduler_tick_max_deferment + * + * Keep at least one tick per second when a single + * active task is running because the scheduler doesn't + * yet completely support full dynticks environment. + * + * This makes sure that uptime, CFS vruntime, load + * balancing, etc... continue to move forward, even + * with a very low granularity. + */ +u64 scheduler_tick_max_deferment(void) +{ + struct rq *rq = this_rq(); + unsigned long next, now = ACCESS_ONCE(jiffies); + + next = rq->last_sched_tick + HZ; + + if (time_before_eq(next, now)) + return 0; + + return jiffies_to_usecs(next - now) * NSEC_PER_USEC; } +#endif notrace unsigned long get_parent_ip(unsigned long addr) { @@ -6951,9 +7018,12 @@ void __init sched_init(void) INIT_LIST_HEAD(&rq->cfs_tasks); rq_attach_root(rq, &def_root_domain); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON rq->nohz_flags = 0; #endif +#ifdef CONFIG_NO_HZ_FULL + rq->last_sched_tick = 0; +#endif #endif init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8bf7081b1ec5..c61a614465c8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5355,7 +5355,7 @@ out_unlock: return 0; } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * idle load balancing details * - When one of the busy CPUs notice that there may be an idle rebalancing @@ -5572,9 +5572,9 @@ out: rq->next_balance = next_balance; } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* - * In CONFIG_NO_HZ case, the idle balance kickee will do the + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the * rebalancing for all the cpus for whom scheduler ticks are stopped. */ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) @@ -5717,7 +5717,7 @@ void trigger_load_balance(struct rq *rq, int cpu) if (time_after_eq(jiffies, rq->next_balance) && likely(!on_null_domain(cpu))) raise_softirq(SCHED_SOFTIRQ); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) nohz_balancer_kick(cpu); #endif @@ -6187,7 +6187,7 @@ __init void init_sched_fair_class(void) #ifdef CONFIG_SMP open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON nohz.next_balance = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); cpu_notifier(sched_ilb_notifier, 0); diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index b8ce77328341..d8da01008d39 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -17,6 +17,7 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) static void pre_schedule_idle(struct rq *rq, struct task_struct *prev) { idle_exit_fair(rq); + rq_last_tick_reset(rq); } static void post_schedule_idle(struct rq *rq) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4c225c4c7111..ce39224d6155 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -5,6 +5,7 @@ #include <linux/mutex.h> #include <linux/spinlock.h> #include <linux/stop_machine.h> +#include <linux/tick.h> #include "cpupri.h" #include "cpuacct.h" @@ -405,10 +406,13 @@ struct rq { #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; unsigned long last_load_update_tick; -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON u64 nohz_stamp; unsigned long nohz_flags; #endif +#ifdef CONFIG_NO_HZ_FULL + unsigned long last_sched_tick; +#endif int skip_clock_update; /* capture load from *all* tasks on this cpu: */ @@ -1072,6 +1076,16 @@ static inline u64 steal_ticks(u64 steal) static inline void inc_nr_running(struct rq *rq) { rq->nr_running++; + +#ifdef CONFIG_NO_HZ_FULL + if (rq->nr_running == 2) { + if (tick_nohz_full_cpu(rq->cpu)) { + /* Order rq->nr_running write against the IPI */ + smp_wmb(); + smp_send_reschedule(rq->cpu); + } + } +#endif } static inline void dec_nr_running(struct rq *rq) @@ -1079,6 +1093,13 @@ static inline void dec_nr_running(struct rq *rq) rq->nr_running--; } +static inline void rq_last_tick_reset(struct rq *rq) +{ +#ifdef CONFIG_NO_HZ_FULL + rq->last_sched_tick = jiffies; +#endif +} + extern void update_rq_clock(struct rq *rq); extern void activate_task(struct rq *rq, struct task_struct *p, int flags); @@ -1299,7 +1320,7 @@ extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); extern void account_cfs_bandwidth_used(int enabled, int was_enabled); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON enum rq_nohz_flag_bits { NOHZ_TICK_STOPPED, NOHZ_BALANCE_KICK, diff --git a/kernel/softirq.c b/kernel/softirq.c index aa82723c7202..b5197dcb0dad 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -329,6 +329,19 @@ static inline void invoke_softirq(void) wakeup_softirqd(); } +static inline void tick_irq_exit(void) +{ +#ifdef CONFIG_NO_HZ_COMMON + int cpu = smp_processor_id(); + + /* Make sure that timer wheel updates are propagated */ + if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) { + if (!in_interrupt()) + tick_nohz_irq_exit(); + } +#endif +} + /* * Exit an interrupt context. Process softirqs if needed and possible: */ @@ -346,11 +359,7 @@ void irq_exit(void) if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); -#ifdef CONFIG_NO_HZ - /* Make sure that timer wheel updates are propagated */ - if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) - tick_nohz_irq_exit(); -#endif + tick_irq_exit(); rcu_irq_exit(); } diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index bfd6787b355a..7078052284fd 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -200,6 +200,7 @@ cond_syscall(sys_perf_event_open); /* fanotify! */ cond_syscall(sys_fanotify_init); cond_syscall(sys_fanotify_mark); +cond_syscall(compat_sys_fanotify_mark); /* open by handle */ cond_syscall(sys_name_to_handle_at); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index ebf72358e86a..aea4a9ea6fc8 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -15,6 +15,7 @@ #include <linux/netdevice.h> #include <linux/kernel.h> #include <linux/slab.h> +#include <linux/compat.h> #ifdef CONFIG_SYSCTL_SYSCALL @@ -1447,7 +1448,6 @@ SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args) #ifdef CONFIG_COMPAT -#include <asm/compat.h> struct compat_sysctl_args { compat_uptr_t name; @@ -1459,7 +1459,7 @@ struct compat_sysctl_args { compat_ulong_t __unused[4]; }; -asmlinkage long compat_sys_sysctl(struct compat_sysctl_args __user *args) +COMPAT_SYSCALL_DEFINE1(sysctl, struct compat_sysctl_args __user *, args) { struct compat_sysctl_args tmp; compat_size_t __user *compat_oldlenp; diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 24510d84efd7..e4c07b0692bb 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -64,20 +64,88 @@ config GENERIC_CMOS_UPDATE if GENERIC_CLOCKEVENTS menu "Timers subsystem" -# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is +# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is # only related to the tick functionality. Oneshot clockevent devices # are supported independ of this. config TICK_ONESHOT bool -config NO_HZ - bool "Tickless System (Dynamic Ticks)" +config NO_HZ_COMMON + bool depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS select TICK_ONESHOT + +choice + prompt "Timer tick handling" + default NO_HZ_IDLE if NO_HZ + +config HZ_PERIODIC + bool "Periodic timer ticks (constant rate, no dynticks)" + help + This option keeps the tick running periodically at a constant + rate, even when the CPU doesn't need it. + +config NO_HZ_IDLE + bool "Idle dynticks system (tickless idle)" + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + select NO_HZ_COMMON + help + This option enables a tickless idle system: timer interrupts + will only trigger on an as-needed basis when the system is idle. + This is usually interesting for energy saving. + + Most of the time you want to say Y here. + +config NO_HZ_FULL + bool "Full dynticks system (tickless)" + # NO_HZ_COMMON dependency + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + # We need at least one periodic CPU for timekeeping + depends on SMP + # RCU_USER_QS dependency + depends on HAVE_CONTEXT_TRACKING + # VIRT_CPU_ACCOUNTING_GEN dependency + depends on 64BIT + select NO_HZ_COMMON + select RCU_USER_QS + select RCU_NOCB_CPU + select VIRT_CPU_ACCOUNTING_GEN + select CONTEXT_TRACKING_FORCE + select IRQ_WORK + help + Adaptively try to shutdown the tick whenever possible, even when + the CPU is running tasks. Typically this requires running a single + task on the CPU. Chances for running tickless are maximized when + the task mostly runs in userspace and has few kernel activity. + + You need to fill up the nohz_full boot parameter with the + desired range of dynticks CPUs. + + This is implemented at the expense of some overhead in user <-> kernel + transitions: syscalls, exceptions and interrupts. Even when it's + dynamically off. + + Say N. + +endchoice + +config NO_HZ_FULL_ALL + bool "Full dynticks system on all CPUs by default" + depends on NO_HZ_FULL + help + If the user doesn't pass the nohz_full boot option to + define the range of full dynticks CPUs, consider that all + CPUs in the system are full dynticks by default. + Note the boot CPU will still be kept outside the range to + handle the timekeeping duty. + +config NO_HZ + bool "Old Idle dynticks config" + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS help - This option enables a tickless system: timer interrupts will - only trigger on an as-needed basis both when the system is - busy and when the system is idle. + This is the old config entry that enables dynticks idle. + We keep it around for a little while to enforce backward + compatibility with older config files. config HIGH_RES_TIMERS bool "High Resolution Timer Support" diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 61d00a8cdf2f..206bbfb34e09 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -693,7 +693,8 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) bc->event_handler = tick_handle_oneshot_broadcast; /* Take the do_timer update */ - tick_do_timer_cpu = cpu; + if (!tick_nohz_full_cpu(cpu)) + tick_do_timer_cpu = cpu; /* * We must be careful here. There might be other CPUs diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 6176a3e45709..5d3fb100bc06 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -163,7 +163,10 @@ static void tick_setup_device(struct tick_device *td, * this cpu: */ if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { - tick_do_timer_cpu = cpu; + if (!tick_nohz_full_cpu(cpu)) + tick_do_timer_cpu = cpu; + else + tick_do_timer_cpu = TICK_DO_TIMER_NONE; tick_next_period = ktime_get(); tick_period = ktime_set(0, NSEC_PER_SEC / HZ); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 225f8bf19095..bc67d4245e1d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -21,11 +21,15 @@ #include <linux/sched.h> #include <linux/module.h> #include <linux/irq_work.h> +#include <linux/posix-timers.h> +#include <linux/perf_event.h> #include <asm/irq_regs.h> #include "tick-internal.h" +#include <trace/events/timer.h> + /* * Per cpu nohz control structure */ @@ -104,7 +108,7 @@ static void tick_sched_do_timer(ktime_t now) { int cpu = smp_processor_id(); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * Check if the do_timer duty was dropped. We don't care about * concurrency: This happens only when the cpu in charge went @@ -112,7 +116,8 @@ static void tick_sched_do_timer(ktime_t now) * this duty, then the jiffies update is still serialized by * jiffies_lock. */ - if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE) + && !tick_nohz_full_cpu(cpu)) tick_do_timer_cpu = cpu; #endif @@ -123,7 +128,7 @@ static void tick_sched_do_timer(ktime_t now) static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) { -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * When we are idle and the tick is stopped, we have to touch * the watchdog as we might not schedule for a really long @@ -142,10 +147,226 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) profile_tick(CPU_PROFILING); } +#ifdef CONFIG_NO_HZ_FULL +static cpumask_var_t nohz_full_mask; +bool have_nohz_full_mask; + +static bool can_stop_full_tick(void) +{ + WARN_ON_ONCE(!irqs_disabled()); + + if (!sched_can_stop_tick()) { + trace_tick_stop(0, "more than 1 task in runqueue\n"); + return false; + } + + if (!posix_cpu_timers_can_stop_tick(current)) { + trace_tick_stop(0, "posix timers running\n"); + return false; + } + + if (!perf_event_can_stop_tick()) { + trace_tick_stop(0, "perf events running\n"); + return false; + } + + /* sched_clock_tick() needs us? */ +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK + /* + * TODO: kick full dynticks CPUs when + * sched_clock_stable is set. + */ + if (!sched_clock_stable) { + trace_tick_stop(0, "unstable sched clock\n"); + return false; + } +#endif + + return true; +} + +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); + +/* + * Re-evaluate the need for the tick on the current CPU + * and restart it if necessary. + */ +void tick_nohz_full_check(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + + if (tick_nohz_full_cpu(smp_processor_id())) { + if (ts->tick_stopped && !is_idle_task(current)) { + if (!can_stop_full_tick()) + tick_nohz_restart_sched_tick(ts, ktime_get()); + } + } +} + +static void nohz_full_kick_work_func(struct irq_work *work) +{ + tick_nohz_full_check(); +} + +static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { + .func = nohz_full_kick_work_func, +}; + +/* + * Kick the current CPU if it's full dynticks in order to force it to + * re-evaluate its dependency on the tick and restart it if necessary. + */ +void tick_nohz_full_kick(void) +{ + if (tick_nohz_full_cpu(smp_processor_id())) + irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); +} + +static void nohz_full_kick_ipi(void *info) +{ + tick_nohz_full_check(); +} + +/* + * Kick all full dynticks CPUs in order to force these to re-evaluate + * their dependency on the tick and restart it if necessary. + */ +void tick_nohz_full_kick_all(void) +{ + if (!have_nohz_full_mask) + return; + + preempt_disable(); + smp_call_function_many(nohz_full_mask, + nohz_full_kick_ipi, NULL, false); + preempt_enable(); +} + +/* + * Re-evaluate the need for the tick as we switch the current task. + * It might need the tick due to per task/process properties: + * perf events, posix cpu timers, ... + */ +void tick_nohz_task_switch(struct task_struct *tsk) +{ + unsigned long flags; + + local_irq_save(flags); + + if (!tick_nohz_full_cpu(smp_processor_id())) + goto out; + + if (tick_nohz_tick_stopped() && !can_stop_full_tick()) + tick_nohz_full_kick(); + +out: + local_irq_restore(flags); +} + +int tick_nohz_full_cpu(int cpu) +{ + if (!have_nohz_full_mask) + return 0; + + return cpumask_test_cpu(cpu, nohz_full_mask); +} + +/* Parse the boot-time nohz CPU list from the kernel parameters. */ +static int __init tick_nohz_full_setup(char *str) +{ + int cpu; + + alloc_bootmem_cpumask_var(&nohz_full_mask); + if (cpulist_parse(str, nohz_full_mask) < 0) { + pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); + return 1; + } + + cpu = smp_processor_id(); + if (cpumask_test_cpu(cpu, nohz_full_mask)) { + pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); + cpumask_clear_cpu(cpu, nohz_full_mask); + } + have_nohz_full_mask = true; + + return 1; +} +__setup("nohz_full=", tick_nohz_full_setup); + +static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + /* + * If we handle the timekeeping duty for full dynticks CPUs, + * we can't safely shutdown that CPU. + */ + if (have_nohz_full_mask && tick_do_timer_cpu == cpu) + return -EINVAL; + break; + } + return NOTIFY_OK; +} + +/* + * Worst case string length in chunks of CPU range seems 2 steps + * separations: 0,2,4,6,... + * This is NR_CPUS + sizeof('\0') + */ +static char __initdata nohz_full_buf[NR_CPUS + 1]; + +static int tick_nohz_init_all(void) +{ + int err = -1; + +#ifdef CONFIG_NO_HZ_FULL_ALL + if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) { + pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); + return err; + } + err = 0; + cpumask_setall(nohz_full_mask); + cpumask_clear_cpu(smp_processor_id(), nohz_full_mask); + have_nohz_full_mask = true; +#endif + return err; +} + +void __init tick_nohz_init(void) +{ + int cpu; + + if (!have_nohz_full_mask) { + if (tick_nohz_init_all() < 0) + return; + } + + cpu_notifier(tick_nohz_cpu_down_callback, 0); + + /* Make sure full dynticks CPU are also RCU nocbs */ + for_each_cpu(cpu, nohz_full_mask) { + if (!rcu_is_nocb_cpu(cpu)) { + pr_warning("NO_HZ: CPU %d is not RCU nocb: " + "cleared from nohz_full range", cpu); + cpumask_clear_cpu(cpu, nohz_full_mask); + } + } + + cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); + pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); +} +#else +#define have_nohz_full_mask (0) +#endif + /* * NOHZ - aka dynamic tick functionality */ -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * NO HZ enabled ? */ @@ -345,11 +566,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, delta_jiffies = rcu_delta_jiffies; } } + /* - * Do not stop the tick, if we are only one off - * or if the cpu is required for rcu + * Do not stop the tick, if we are only one off (or less) + * or if the cpu is required for RCU: */ - if (!ts->tick_stopped && delta_jiffies == 1) + if (!ts->tick_stopped && delta_jiffies <= 1) goto out; /* Schedule the tick, if we are at least one jiffie off */ @@ -378,6 +600,13 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, time_delta = KTIME_MAX; } +#ifdef CONFIG_NO_HZ_FULL + if (!ts->inidle) { + time_delta = min(time_delta, + scheduler_tick_max_deferment()); + } +#endif + /* * calculate the expiry time for the next timer wheel * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals @@ -421,6 +650,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; + trace_tick_stop(1, " "); } /* @@ -457,6 +687,24 @@ out: return ret; } +static void tick_nohz_full_stop_tick(struct tick_sched *ts) +{ +#ifdef CONFIG_NO_HZ_FULL + int cpu = smp_processor_id(); + + if (!tick_nohz_full_cpu(cpu) || is_idle_task(current)) + return; + + if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) + return; + + if (!can_stop_full_tick()) + return; + + tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); +#endif +} + static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) { /* @@ -489,6 +737,21 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return false; } + if (have_nohz_full_mask) { + /* + * Keep the tick alive to guarantee timekeeping progression + * if there are full dynticks CPUs around + */ + if (tick_do_timer_cpu == cpu) + return false; + /* + * Boot safety: make sure the timekeeping duty has been + * assigned before entering dyntick-idle mode, + */ + if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) + return false; + } + return true; } @@ -568,12 +831,13 @@ void tick_nohz_irq_exit(void) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); - if (!ts->inidle) - return; - - /* Cancel the timer because CPU already waken up from the C-states*/ - menu_hrtimer_cancel(); - __tick_nohz_idle_enter(ts); + if (ts->inidle) { + /* Cancel the timer because CPU already waken up from the C-states*/ + menu_hrtimer_cancel(); + __tick_nohz_idle_enter(ts); + } else { + tick_nohz_full_stop_tick(ts); + } } /** @@ -802,7 +1066,7 @@ static inline void tick_check_nohz(int cpu) static inline void tick_nohz_switch_to_nohz(void) { } static inline void tick_check_nohz(int cpu) { } -#endif /* NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ /* * Called from irq_enter to notify about the possible interruption of idle() @@ -887,14 +1151,14 @@ void tick_setup_sched_timer(void) now = ktime_get(); } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON if (tick_nohz_enabled) ts->nohz_mode = NOHZ_MODE_HIGHRES; #endif } #endif /* HIGH_RES_TIMERS */ -#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS +#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS void tick_cancel_sched_timer(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); diff --git a/kernel/timer.c b/kernel/timer.c index 09bca8ce9771..a860bba34412 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -739,7 +739,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, cpu = smp_processor_id(); -#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) +#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) cpu = get_nohz_timer_target(); #endif @@ -931,14 +931,14 @@ void add_timer_on(struct timer_list *timer, int cpu) debug_activate(timer, timer->expires); internal_add_timer(base, timer); /* - * Check whether the other CPU is idle and needs to be - * triggered to reevaluate the timer wheel when nohz is - * active. We are protected against the other CPU fiddling + * Check whether the other CPU is in dynticks mode and needs + * to be triggered to reevaluate the timer wheel. + * We are protected against the other CPU fiddling * with the timer by holding the timer base lock. This also - * makes sure that a CPU on the way to idle can not evaluate - * the timer wheel. + * makes sure that a CPU on the way to stop its tick can not + * evaluate the timer wheel. */ - wake_up_idle_cpu(cpu); + wake_up_nohz_cpu(cpu); spin_unlock_irqrestore(&base->lock, flags); } EXPORT_SYMBOL_GPL(add_timer_on); @@ -1189,7 +1189,7 @@ static inline void __run_timers(struct tvec_base *base) spin_unlock_irq(&base->lock); } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * Find out when the next timer event is due to happen. This * is used on S/390 to stop all activity when a CPU is idle. diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5e9efd4b83a4..015f85aaca08 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -71,6 +71,7 @@ config TRACE_CLOCK config RING_BUFFER bool select TRACE_CLOCK + select IRQ_WORK config FTRACE_NMI_ENTER bool @@ -107,7 +108,6 @@ config TRACING select BINARY_PRINTF select EVENT_TRACING select TRACE_CLOCK - select IRQ_WORK config GENERIC_TRACER bool diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index ed58a3216a6d..b8b8560bfb95 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1808,6 +1808,7 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) rwbs[i] = '\0'; } +EXPORT_SYMBOL_GPL(blk_fill_rwbs); #endif /* CONFIG_EVENT_TRACING */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8a5c017bb50c..b549b0f5b977 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -64,6 +64,13 @@ #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) +#ifdef CONFIG_DYNAMIC_FTRACE +#define INIT_REGEX_LOCK(opsname) \ + .regex_lock = __MUTEX_INITIALIZER(opsname.regex_lock), +#else +#define INIT_REGEX_LOCK(opsname) +#endif + static struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, @@ -131,6 +138,16 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); while (likely(op = rcu_dereference_raw((op)->next)) && \ unlikely((op) != &ftrace_list_end)) +static inline void ftrace_ops_init(struct ftrace_ops *ops) +{ +#ifdef CONFIG_DYNAMIC_FTRACE + if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) { + mutex_init(&ops->regex_lock); + ops->flags |= FTRACE_OPS_FL_INITIALIZED; + } +#endif +} + /** * ftrace_nr_registered_ops - return number of ops registered * @@ -907,7 +924,8 @@ static void unregister_ftrace_profiler(void) #else static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, + INIT_REGEX_LOCK(ftrace_profile_ops) }; static int register_ftrace_profiler(void) @@ -1103,11 +1121,10 @@ static struct ftrace_ops global_ops = { .func = ftrace_stub, .notrace_hash = EMPTY_HASH, .filter_hash = EMPTY_HASH, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, + INIT_REGEX_LOCK(global_ops) }; -static DEFINE_MUTEX(ftrace_regex_lock); - struct ftrace_page { struct ftrace_page *next; struct dyn_ftrace *records; @@ -1247,6 +1264,7 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) void ftrace_free_filter(struct ftrace_ops *ops) { + ftrace_ops_init(ops); free_ftrace_hash(ops->filter_hash); free_ftrace_hash(ops->notrace_hash); } @@ -2441,7 +2459,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) || ((iter->flags & FTRACE_ITER_ENABLED) && - !(rec->flags & ~FTRACE_FL_MASK))) { + !(rec->flags & FTRACE_FL_ENABLED))) { rec = NULL; goto retry; @@ -2624,6 +2642,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, struct ftrace_hash *hash; int ret = 0; + ftrace_ops_init(ops); + if (unlikely(ftrace_disabled)) return -ENODEV; @@ -2636,28 +2656,26 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, return -ENOMEM; } + iter->ops = ops; + iter->flags = flag; + + mutex_lock(&ops->regex_lock); + if (flag & FTRACE_ITER_NOTRACE) hash = ops->notrace_hash; else hash = ops->filter_hash; - iter->ops = ops; - iter->flags = flag; - if (file->f_mode & FMODE_WRITE) { - mutex_lock(&ftrace_lock); iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash); - mutex_unlock(&ftrace_lock); - if (!iter->hash) { trace_parser_put(&iter->parser); kfree(iter); - return -ENOMEM; + ret = -ENOMEM; + goto out_unlock; } } - mutex_lock(&ftrace_regex_lock); - if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) ftrace_filter_reset(iter->hash); @@ -2677,7 +2695,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, } } else file->private_data = iter; - mutex_unlock(&ftrace_regex_lock); + + out_unlock: + mutex_unlock(&ops->regex_lock); return ret; } @@ -2910,6 +2930,8 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops trace_probe_ops __read_mostly = { .func = function_trace_probe_call, + .flags = FTRACE_OPS_FL_INITIALIZED, + INIT_REGEX_LOCK(trace_probe_ops) }; static int ftrace_probe_registered; @@ -2919,8 +2941,12 @@ static void __enable_ftrace_function_probe(void) int ret; int i; - if (ftrace_probe_registered) + if (ftrace_probe_registered) { + /* still need to update the function call sites */ + if (ftrace_enabled) + ftrace_run_update_code(FTRACE_UPDATE_CALLS); return; + } for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { struct hlist_head *hhd = &ftrace_func_hash[i]; @@ -2990,19 +3016,21 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, if (WARN_ON(not)) return -EINVAL; - mutex_lock(&ftrace_lock); + mutex_lock(&trace_probe_ops.regex_lock); hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); if (!hash) { count = -ENOMEM; - goto out_unlock; + goto out; } if (unlikely(ftrace_disabled)) { count = -ENODEV; - goto out_unlock; + goto out; } + mutex_lock(&ftrace_lock); + do_for_each_ftrace_rec(pg, rec) { if (!ftrace_match_record(rec, NULL, search, len, type)) @@ -3056,6 +3084,8 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, out_unlock: mutex_unlock(&ftrace_lock); + out: + mutex_unlock(&trace_probe_ops.regex_lock); free_ftrace_hash(hash); return count; @@ -3095,7 +3125,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, return; } - mutex_lock(&ftrace_lock); + mutex_lock(&trace_probe_ops.regex_lock); hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); if (!hash) @@ -3133,6 +3163,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, list_add(&entry->free_list, &free_list); } } + mutex_lock(&ftrace_lock); __disable_ftrace_function_probe(); /* * Remove after the disable is called. Otherwise, if the last @@ -3144,9 +3175,10 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, list_del(&entry->free_list); ftrace_free_entry(entry); } + mutex_unlock(&ftrace_lock); out_unlock: - mutex_unlock(&ftrace_lock); + mutex_unlock(&trace_probe_ops.regex_lock); free_ftrace_hash(hash); } @@ -3256,18 +3288,17 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, if (!cnt) return 0; - mutex_lock(&ftrace_regex_lock); - - ret = -ENODEV; - if (unlikely(ftrace_disabled)) - goto out_unlock; - if (file->f_mode & FMODE_READ) { struct seq_file *m = file->private_data; iter = m->private; } else iter = file->private_data; + if (unlikely(ftrace_disabled)) + return -ENODEV; + + /* iter->hash is a local copy, so we don't need regex_lock */ + parser = &iter->parser; read = trace_get_user(parser, ubuf, cnt, ppos); @@ -3276,14 +3307,12 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, ret = ftrace_process_regex(iter->hash, parser->buffer, parser->idx, enable); trace_parser_clear(parser); - if (ret) - goto out_unlock; + if (ret < 0) + goto out; } ret = read; -out_unlock: - mutex_unlock(&ftrace_regex_lock); - + out: return ret; } @@ -3335,16 +3364,19 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, if (unlikely(ftrace_disabled)) return -ENODEV; + mutex_lock(&ops->regex_lock); + if (enable) orig_hash = &ops->filter_hash; else orig_hash = &ops->notrace_hash; hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); - if (!hash) - return -ENOMEM; + if (!hash) { + ret = -ENOMEM; + goto out_regex_unlock; + } - mutex_lock(&ftrace_regex_lock); if (reset) ftrace_filter_reset(hash); if (buf && !ftrace_match_records(hash, buf, len)) { @@ -3366,7 +3398,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, mutex_unlock(&ftrace_lock); out_regex_unlock: - mutex_unlock(&ftrace_regex_lock); + mutex_unlock(&ops->regex_lock); free_ftrace_hash(hash); return ret; @@ -3392,6 +3424,7 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, int remove, int reset) { + ftrace_ops_init(ops); return ftrace_set_addr(ops, ip, remove, reset, 1); } EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); @@ -3416,6 +3449,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, int len, int reset) { + ftrace_ops_init(ops); return ftrace_set_regex(ops, buf, len, reset, 1); } EXPORT_SYMBOL_GPL(ftrace_set_filter); @@ -3434,6 +3468,7 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter); int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, int len, int reset) { + ftrace_ops_init(ops); return ftrace_set_regex(ops, buf, len, reset, 0); } EXPORT_SYMBOL_GPL(ftrace_set_notrace); @@ -3524,6 +3559,8 @@ ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable) { char *func; + ftrace_ops_init(ops); + while (buf) { func = strsep(&buf, ","); ftrace_set_regex(ops, func, strlen(func), 0, enable); @@ -3551,10 +3588,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file) int filter_hash; int ret; - mutex_lock(&ftrace_regex_lock); if (file->f_mode & FMODE_READ) { iter = m->private; - seq_release(inode, file); } else iter = file->private_data; @@ -3567,6 +3602,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file) trace_parser_put(parser); + mutex_lock(&iter->ops->regex_lock); + if (file->f_mode & FMODE_WRITE) { filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); @@ -3584,10 +3621,11 @@ int ftrace_regex_release(struct inode *inode, struct file *file) mutex_unlock(&ftrace_lock); } + + mutex_unlock(&iter->ops->regex_lock); free_ftrace_hash(iter->hash); kfree(iter); - mutex_unlock(&ftrace_regex_lock); return 0; } @@ -4126,7 +4164,8 @@ void __init ftrace_init(void) static struct ftrace_ops global_ops = { .func = ftrace_stub, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, + INIT_REGEX_LOCK(global_ops) }; static int __init ftrace_nodyn_init(void) @@ -4180,8 +4219,9 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, } static struct ftrace_ops control_ops = { - .func = ftrace_ops_control_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, + .func = ftrace_ops_control_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, + INIT_REGEX_LOCK(control_ops) }; static inline void @@ -4539,6 +4579,8 @@ int register_ftrace_function(struct ftrace_ops *ops) { int ret = -1; + ftrace_ops_init(ops); + mutex_lock(&ftrace_lock); ret = __register_ftrace_function(ops); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 53582e982e51..7a0cf68027cc 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -251,7 +251,8 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file, switch (enable) { case 0: /* - * When soft_disable is set and enable is cleared, we want + * When soft_disable is set and enable is cleared, the sm_ref + * reference counter is decremented. If it reaches 0, we want * to clear the SOFT_DISABLED flag but leave the event in the * state that it was. That is, if the event was enabled and * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED @@ -263,6 +264,8 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file, * "soft enable"s (clearing the SOFT_DISABLED bit) wont work. */ if (soft_disable) { + if (atomic_dec_return(&file->sm_ref) > 0) + break; disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED; clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); } else @@ -291,8 +294,11 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file, */ if (!soft_disable) clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); - else + else { + if (atomic_inc_return(&file->sm_ref) > 1) + break; set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); + } if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) { @@ -623,6 +629,8 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, if (file->flags & FTRACE_EVENT_FL_ENABLED) { if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED) buf = "0*\n"; + else if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) + buf = "1*\n"; else buf = "1\n"; } else @@ -1521,6 +1529,24 @@ __register_event(struct ftrace_event_call *call, struct module *mod) return 0; } +static struct ftrace_event_file * +trace_create_new_event(struct ftrace_event_call *call, + struct trace_array *tr) +{ + struct ftrace_event_file *file; + + file = kmem_cache_alloc(file_cachep, GFP_TRACE); + if (!file) + return NULL; + + file->event_call = call; + file->tr = tr; + atomic_set(&file->sm_ref, 0); + list_add(&file->list, &tr->events); + + return file; +} + /* Add an event to a trace directory */ static int __trace_add_new_event(struct ftrace_event_call *call, @@ -1532,14 +1558,10 @@ __trace_add_new_event(struct ftrace_event_call *call, { struct ftrace_event_file *file; - file = kmem_cache_alloc(file_cachep, GFP_TRACE); + file = trace_create_new_event(call, tr); if (!file) return -ENOMEM; - file->event_call = call; - file->tr = tr; - list_add(&file->list, &tr->events); - return event_create_dir(tr->event_dir, file, id, enable, filter, format); } @@ -1554,14 +1576,10 @@ __trace_early_add_new_event(struct ftrace_event_call *call, { struct ftrace_event_file *file; - file = kmem_cache_alloc(file_cachep, GFP_TRACE); + file = trace_create_new_event(call, tr); if (!file) return -ENOMEM; - file->event_call = call; - file->tr = tr; - list_add(&file->list, &tr->events); - return 0; } @@ -2061,8 +2079,18 @@ event_enable_func(struct ftrace_hash *hash, if (ret < 0) goto out_put; ret = register_ftrace_function_probe(glob, ops, data); - if (!ret) + /* + * The above returns on success the # of functions enabled, + * but if it didn't find any functions it returns zero. + * Consider no functions a failure too. + */ + if (!ret) { + ret = -ENOENT; + goto out_disable; + } else if (ret < 0) goto out_disable; + /* Just return zero, not the number of enabled functions */ + ret = 0; out: mutex_unlock(&event_mutex); return ret; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1865d5f76538..636d45fe69b3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -27,7 +27,6 @@ /** * Kprobe event core functions */ - struct trace_probe { struct list_head list; struct kretprobe rp; /* Use rp.kp for kprobe use */ @@ -36,6 +35,7 @@ struct trace_probe { const char *symbol; /* symbol name */ struct ftrace_event_class class; struct ftrace_event_call call; + struct ftrace_event_file **files; ssize_t size; /* trace entry size */ unsigned int nr_args; struct probe_arg args[]; @@ -46,7 +46,7 @@ struct trace_probe { (sizeof(struct probe_arg) * (n))) -static __kprobes int trace_probe_is_return(struct trace_probe *tp) +static __kprobes bool trace_probe_is_return(struct trace_probe *tp) { return tp->rp.handler != NULL; } @@ -183,12 +183,57 @@ static struct trace_probe *find_trace_probe(const char *event, return NULL; } -/* Enable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */ -static int enable_trace_probe(struct trace_probe *tp, int flag) +static int trace_probe_nr_files(struct trace_probe *tp) +{ + struct ftrace_event_file **file = tp->files; + int ret = 0; + + if (file) + while (*(file++)) + ret++; + + return ret; +} + +static DEFINE_MUTEX(probe_enable_lock); + +/* + * Enable trace_probe + * if the file is NULL, enable "perf" handler, or enable "trace" handler. + */ +static int +enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) { int ret = 0; - tp->flags |= flag; + mutex_lock(&probe_enable_lock); + + if (file) { + struct ftrace_event_file **new, **old = tp->files; + int n = trace_probe_nr_files(tp); + + /* 1 is for new one and 1 is for stopper */ + new = kzalloc((n + 2) * sizeof(struct ftrace_event_file *), + GFP_KERNEL); + if (!new) { + ret = -ENOMEM; + goto out_unlock; + } + memcpy(new, old, n * sizeof(struct ftrace_event_file *)); + new[n] = file; + /* The last one keeps a NULL */ + + rcu_assign_pointer(tp->files, new); + tp->flags |= TP_FLAG_TRACE; + + if (old) { + /* Make sure the probe is done with old files */ + synchronize_sched(); + kfree(old); + } + } else + tp->flags |= TP_FLAG_PROFILE; + if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) && !trace_probe_has_gone(tp)) { if (trace_probe_is_return(tp)) @@ -197,19 +242,83 @@ static int enable_trace_probe(struct trace_probe *tp, int flag) ret = enable_kprobe(&tp->rp.kp); } + out_unlock: + mutex_unlock(&probe_enable_lock); + return ret; } -/* Disable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */ -static void disable_trace_probe(struct trace_probe *tp, int flag) +static int +trace_probe_file_index(struct trace_probe *tp, struct ftrace_event_file *file) +{ + int i; + + if (tp->files) { + for (i = 0; tp->files[i]; i++) + if (tp->files[i] == file) + return i; + } + + return -1; +} + +/* + * Disable trace_probe + * if the file is NULL, disable "perf" handler, or disable "trace" handler. + */ +static int +disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) { - tp->flags &= ~flag; + int ret = 0; + + mutex_lock(&probe_enable_lock); + + if (file) { + struct ftrace_event_file **new, **old = tp->files; + int n = trace_probe_nr_files(tp); + int i, j; + + if (n == 0 || trace_probe_file_index(tp, file) < 0) { + ret = -EINVAL; + goto out_unlock; + } + + if (n == 1) { /* Remove the last file */ + tp->flags &= ~TP_FLAG_TRACE; + new = NULL; + } else { + new = kzalloc(n * sizeof(struct ftrace_event_file *), + GFP_KERNEL); + if (!new) { + ret = -ENOMEM; + goto out_unlock; + } + + /* This copy & check loop copies the NULL stopper too */ + for (i = 0, j = 0; j < n && i < n + 1; i++) + if (old[i] != file) + new[j++] = old[i]; + } + + rcu_assign_pointer(tp->files, new); + + /* Make sure the probe is done with old files */ + synchronize_sched(); + kfree(old); + } else + tp->flags &= ~TP_FLAG_PROFILE; + if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) { if (trace_probe_is_return(tp)) disable_kretprobe(&tp->rp); else disable_kprobe(&tp->rp.kp); } + + out_unlock: + mutex_unlock(&probe_enable_lock); + + return ret; } /* Internal register function - just handle k*probes and flags */ @@ -723,9 +832,10 @@ static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp, } /* Kprobe handler */ -static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) +static __kprobes void +__kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, + struct ftrace_event_file *ftrace_file) { - struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); struct kprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; @@ -733,7 +843,10 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; - tp->nhit++; + WARN_ON(call != ftrace_file->event_call); + + if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) + return; local_save_flags(irq_flags); pc = preempt_count(); @@ -741,13 +854,14 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) dsize = __get_data_size(tp, regs); size = sizeof(*entry) + tp->size + dsize; - event = trace_current_buffer_lock_reserve(&buffer, call->event.type, - size, irq_flags, pc); + event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, + call->event.type, + size, irq_flags, pc); if (!event) return; entry = ring_buffer_event_data(event); - entry->ip = (unsigned long)kp->addr; + entry->ip = (unsigned long)tp->rp.kp.addr; store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); if (!filter_current_check_discard(buffer, call, entry, event)) @@ -755,11 +869,24 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) irq_flags, pc, regs); } +static __kprobes void +kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs) +{ + struct ftrace_event_file **file = tp->files; + + /* Note: preempt is already disabled around the kprobe handler */ + while (*file) { + __kprobe_trace_func(tp, regs, *file); + file++; + } +} + /* Kretprobe handler */ -static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, - struct pt_regs *regs) +static __kprobes void +__kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, + struct pt_regs *regs, + struct ftrace_event_file *ftrace_file) { - struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); struct kretprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; @@ -767,14 +894,20 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; + WARN_ON(call != ftrace_file->event_call); + + if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) + return; + local_save_flags(irq_flags); pc = preempt_count(); dsize = __get_data_size(tp, regs); size = sizeof(*entry) + tp->size + dsize; - event = trace_current_buffer_lock_reserve(&buffer, call->event.type, - size, irq_flags, pc); + event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, + call->event.type, + size, irq_flags, pc); if (!event) return; @@ -788,6 +921,19 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, irq_flags, pc, regs); } +static __kprobes void +kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + struct ftrace_event_file **file = tp->files; + + /* Note: preempt is already disabled around the kprobe handler */ + while (*file) { + __kretprobe_trace_func(tp, ri, regs, *file); + file++; + } +} + /* Event entry printers */ enum print_line_t print_kprobe_event(struct trace_iterator *iter, int flags, @@ -975,10 +1121,9 @@ static int set_print_fmt(struct trace_probe *tp) #ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ -static __kprobes void kprobe_perf_func(struct kprobe *kp, - struct pt_regs *regs) +static __kprobes void +kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) { - struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); struct ftrace_event_call *call = &tp->call; struct kprobe_trace_entry_head *entry; struct hlist_head *head; @@ -997,7 +1142,7 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, if (!entry) return; - entry->ip = (unsigned long)kp->addr; + entry->ip = (unsigned long)tp->rp.kp.addr; memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); @@ -1007,10 +1152,10 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, } /* Kretprobe profile handler */ -static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, - struct pt_regs *regs) +static __kprobes void +kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, + struct pt_regs *regs) { - struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); struct ftrace_event_call *call = &tp->call; struct kretprobe_trace_entry_head *entry; struct hlist_head *head; @@ -1044,20 +1189,19 @@ int kprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data) { struct trace_probe *tp = (struct trace_probe *)event->data; + struct ftrace_event_file *file = data; switch (type) { case TRACE_REG_REGISTER: - return enable_trace_probe(tp, TP_FLAG_TRACE); + return enable_trace_probe(tp, file); case TRACE_REG_UNREGISTER: - disable_trace_probe(tp, TP_FLAG_TRACE); - return 0; + return disable_trace_probe(tp, file); #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return enable_trace_probe(tp, TP_FLAG_PROFILE); + return enable_trace_probe(tp, NULL); case TRACE_REG_PERF_UNREGISTER: - disable_trace_probe(tp, TP_FLAG_PROFILE); - return 0; + return disable_trace_probe(tp, NULL); case TRACE_REG_PERF_OPEN: case TRACE_REG_PERF_CLOSE: case TRACE_REG_PERF_ADD: @@ -1073,11 +1217,13 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); + tp->nhit++; + if (tp->flags & TP_FLAG_TRACE) - kprobe_trace_func(kp, regs); + kprobe_trace_func(tp, regs); #ifdef CONFIG_PERF_EVENTS if (tp->flags & TP_FLAG_PROFILE) - kprobe_perf_func(kp, regs); + kprobe_perf_func(tp, regs); #endif return 0; /* We don't tweek kernel, so just return 0 */ } @@ -1087,11 +1233,13 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) { struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + tp->nhit++; + if (tp->flags & TP_FLAG_TRACE) - kretprobe_trace_func(ri, regs); + kretprobe_trace_func(tp, ri, regs); #ifdef CONFIG_PERF_EVENTS if (tp->flags & TP_FLAG_PROFILE) - kretprobe_perf_func(ri, regs); + kretprobe_perf_func(tp, ri, regs); #endif return 0; /* We don't tweek kernel, so just return 0 */ } @@ -1189,11 +1337,24 @@ static __used int kprobe_trace_selftest_target(int a1, int a2, int a3, return a1 + a2 + a3 + a4 + a5 + a6; } +static struct ftrace_event_file * +find_trace_probe_file(struct trace_probe *tp, struct trace_array *tr) +{ + struct ftrace_event_file *file; + + list_for_each_entry(file, &tr->events, list) + if (file->event_call == &tp->call) + return file; + + return NULL; +} + static __init int kprobe_trace_self_tests_init(void) { int ret, warn = 0; int (*target)(int, int, int, int, int, int); struct trace_probe *tp; + struct ftrace_event_file *file; target = kprobe_trace_selftest_target; @@ -1203,31 +1364,43 @@ static __init int kprobe_trace_self_tests_init(void) "$stack $stack0 +0($stack)", create_trace_probe); if (WARN_ON_ONCE(ret)) { - pr_warning("error on probing function entry.\n"); + pr_warn("error on probing function entry.\n"); warn++; } else { /* Enable trace point */ tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); if (WARN_ON_ONCE(tp == NULL)) { - pr_warning("error on getting new probe.\n"); + pr_warn("error on getting new probe.\n"); warn++; - } else - enable_trace_probe(tp, TP_FLAG_TRACE); + } else { + file = find_trace_probe_file(tp, top_trace_array()); + if (WARN_ON_ONCE(file == NULL)) { + pr_warn("error on getting probe file.\n"); + warn++; + } else + enable_trace_probe(tp, file); + } } ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target " "$retval", create_trace_probe); if (WARN_ON_ONCE(ret)) { - pr_warning("error on probing function return.\n"); + pr_warn("error on probing function return.\n"); warn++; } else { /* Enable trace point */ tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); if (WARN_ON_ONCE(tp == NULL)) { - pr_warning("error on getting new probe.\n"); + pr_warn("error on getting 2nd new probe.\n"); warn++; - } else - enable_trace_probe(tp, TP_FLAG_TRACE); + } else { + file = find_trace_probe_file(tp, top_trace_array()); + if (WARN_ON_ONCE(file == NULL)) { + pr_warn("error on getting probe file.\n"); + warn++; + } else + enable_trace_probe(tp, file); + } } if (warn) @@ -1238,27 +1411,39 @@ static __init int kprobe_trace_self_tests_init(void) /* Disable trace points before removing it */ tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); if (WARN_ON_ONCE(tp == NULL)) { - pr_warning("error on getting test probe.\n"); + pr_warn("error on getting test probe.\n"); warn++; - } else - disable_trace_probe(tp, TP_FLAG_TRACE); + } else { + file = find_trace_probe_file(tp, top_trace_array()); + if (WARN_ON_ONCE(file == NULL)) { + pr_warn("error on getting probe file.\n"); + warn++; + } else + disable_trace_probe(tp, file); + } tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); if (WARN_ON_ONCE(tp == NULL)) { - pr_warning("error on getting 2nd test probe.\n"); + pr_warn("error on getting 2nd test probe.\n"); warn++; - } else - disable_trace_probe(tp, TP_FLAG_TRACE); + } else { + file = find_trace_probe_file(tp, top_trace_array()); + if (WARN_ON_ONCE(file == NULL)) { + pr_warn("error on getting probe file.\n"); + warn++; + } else + disable_trace_probe(tp, file); + } ret = traceprobe_command("-:testprobe", create_trace_probe); if (WARN_ON_ONCE(ret)) { - pr_warning("error on deleting a probe.\n"); + pr_warn("error on deleting a probe.\n"); warn++; } ret = traceprobe_command("-:testprobe2", create_trace_probe); if (WARN_ON_ONCE(ret)) { - pr_warning("error on deleting a probe.\n"); + pr_warn("error on deleting a probe.\n"); warn++; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4aa9f5bc6b2d..1ae602809efb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4311,6 +4311,12 @@ bool current_is_workqueue_rescuer(void) * no synchronization around this function and the test result is * unreliable and only useful as advisory hints or for debugging. * + * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU. + * Note that both per-cpu and unbound workqueues may be associated with + * multiple pool_workqueues which have separate congested states. A + * workqueue being congested on one CPU doesn't mean the workqueue is also + * contested on other CPUs / NUMA nodes. + * * RETURNS: * %true if congested, %false otherwise. */ @@ -4321,6 +4327,9 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) rcu_read_lock_sched(); + if (cpu == WORK_CPU_UNBOUND) + cpu = smp_processor_id(); + if (!(wq->flags & WQ_UNBOUND)) pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); else |