diff options
Diffstat (limited to 'kernel')
134 files changed, 11155 insertions, 6708 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 2251882daf53..44511d100eaa 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -87,6 +87,9 @@ config ARCH_INLINE_WRITE_UNLOCK_IRQ config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE bool +config UNINLINE_SPIN_UNLOCK + bool + # # lock_* functions are inlined when: # - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y @@ -103,100 +106,120 @@ config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE # - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y # +if !DEBUG_SPINLOCK + config INLINE_SPIN_TRYLOCK - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK + def_bool y + depends on ARCH_INLINE_SPIN_TRYLOCK config INLINE_SPIN_TRYLOCK_BH - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK_BH + def_bool y + depends on ARCH_INLINE_SPIN_TRYLOCK_BH config INLINE_SPIN_LOCK - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK config INLINE_SPIN_LOCK_BH - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ - ARCH_INLINE_SPIN_LOCK_BH + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_BH config INLINE_SPIN_LOCK_IRQ - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ - ARCH_INLINE_SPIN_LOCK_IRQ + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQ config INLINE_SPIN_LOCK_IRQSAVE - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ - ARCH_INLINE_SPIN_LOCK_IRQSAVE - -config UNINLINE_SPIN_UNLOCK - bool + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQSAVE config INLINE_SPIN_UNLOCK_BH - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH + def_bool y + depends on ARCH_INLINE_SPIN_UNLOCK_BH config INLINE_SPIN_UNLOCK_IRQ - def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH) + def_bool y + depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH config INLINE_SPIN_UNLOCK_IRQRESTORE - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE + def_bool y + depends on ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE config INLINE_READ_TRYLOCK - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_TRYLOCK + def_bool y + depends on ARCH_INLINE_READ_TRYLOCK config INLINE_READ_LOCK - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK config INLINE_READ_LOCK_BH - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ - ARCH_INLINE_READ_LOCK_BH + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_BH config INLINE_READ_LOCK_IRQ - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ - ARCH_INLINE_READ_LOCK_IRQ + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQ config INLINE_READ_LOCK_IRQSAVE - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ - ARCH_INLINE_READ_LOCK_IRQSAVE + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQSAVE config INLINE_READ_UNLOCK - def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK) + def_bool y + depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK config INLINE_READ_UNLOCK_BH - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_BH + def_bool y + depends on ARCH_INLINE_READ_UNLOCK_BH config INLINE_READ_UNLOCK_IRQ - def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK_BH) + def_bool y + depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_BH config INLINE_READ_UNLOCK_IRQRESTORE - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_IRQRESTORE + def_bool y + depends on ARCH_INLINE_READ_UNLOCK_IRQRESTORE config INLINE_WRITE_TRYLOCK - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_TRYLOCK + def_bool y + depends on ARCH_INLINE_WRITE_TRYLOCK config INLINE_WRITE_LOCK - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK config INLINE_WRITE_LOCK_BH - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ - ARCH_INLINE_WRITE_LOCK_BH + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_BH config INLINE_WRITE_LOCK_IRQ - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ - ARCH_INLINE_WRITE_LOCK_IRQ + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQ config INLINE_WRITE_LOCK_IRQSAVE - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ - ARCH_INLINE_WRITE_LOCK_IRQSAVE + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQSAVE config INLINE_WRITE_UNLOCK - def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK) + def_bool y + depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK config INLINE_WRITE_UNLOCK_BH - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_BH + def_bool y + depends on ARCH_INLINE_WRITE_UNLOCK_BH config INLINE_WRITE_UNLOCK_IRQ - def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH) + def_bool y + depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH config INLINE_WRITE_UNLOCK_IRQRESTORE - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + def_bool y + depends on ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + +endif config MUTEX_SPIN_ON_OWNER - def_bool SMP && !DEBUG_MUTEXES + def_bool y + depends on SMP && !DEBUG_MUTEXES diff --git a/kernel/Makefile b/kernel/Makefile index c0cc67ad764c..6c072b6da239 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o cred.o \ - async.o range.o groups.o lglock.o + async.o range.o groups.o lglock.o smpboot.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files @@ -46,7 +46,6 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += smp.o -obj-$(CONFIG_SMP) += smpboot.o ifneq ($(CONFIG_SMP),y) obj-y += up.o endif @@ -55,6 +54,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o @@ -98,7 +98,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ -obj-$(CONFIG_X86_DS) += trace/ +obj-$(CONFIG_TRACE_CLOCK) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o @@ -110,6 +110,7 @@ obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o obj-$(CONFIG_PADATA) += padata.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o +obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o $(obj)/configs.o: $(obj)/config_data.h @@ -131,3 +132,81 @@ quiet_cmd_timeconst = TIMEC $@ targets += timeconst.h $(obj)/timeconst.h: $(src)/timeconst.pl FORCE $(call if_changed,timeconst) + +ifeq ($(CONFIG_MODULE_SIG),y) +# +# Pull the signing certificate and any extra certificates into the kernel +# + +quiet_cmd_touch = TOUCH $@ + cmd_touch = touch $@ + +extra_certificates: + $(call cmd,touch) + +kernel/modsign_certificate.o: signing_key.x509 extra_certificates + +############################################################################### +# +# If module signing is requested, say by allyesconfig, but a key has not been +# supplied, then one will need to be generated to make sure the build does not +# fail and that the kernel may be used afterwards. +# +############################################################################### +sign_key_with_hash := +ifeq ($(CONFIG_MODULE_SIG_SHA1),y) +sign_key_with_hash := -sha1 +endif +ifeq ($(CONFIG_MODULE_SIG_SHA224),y) +sign_key_with_hash := -sha224 +endif +ifeq ($(CONFIG_MODULE_SIG_SHA256),y) +sign_key_with_hash := -sha256 +endif +ifeq ($(CONFIG_MODULE_SIG_SHA384),y) +sign_key_with_hash := -sha384 +endif +ifeq ($(CONFIG_MODULE_SIG_SHA512),y) +sign_key_with_hash := -sha512 +endif +ifeq ($(sign_key_with_hash),) +$(error Could not determine digest type to use from kernel config) +endif + +signing_key.priv signing_key.x509: x509.genkey + @echo "###" + @echo "### Now generating an X.509 key pair to be used for signing modules." + @echo "###" + @echo "### If this takes a long time, you might wish to run rngd in the" + @echo "### background to keep the supply of entropy topped up. It" + @echo "### needs to be run as root, and uses a hardware random" + @echo "### number generator if one is available." + @echo "###" + openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \ + -x509 -config x509.genkey \ + -outform DER -out signing_key.x509 \ + -keyout signing_key.priv + @echo "###" + @echo "### Key pair generated." + @echo "###" + +x509.genkey: + @echo Generating X.509 key generation config + @echo >x509.genkey "[ req ]" + @echo >>x509.genkey "default_bits = 4096" + @echo >>x509.genkey "distinguished_name = req_distinguished_name" + @echo >>x509.genkey "prompt = no" + @echo >>x509.genkey "string_mask = utf8only" + @echo >>x509.genkey "x509_extensions = myexts" + @echo >>x509.genkey + @echo >>x509.genkey "[ req_distinguished_name ]" + @echo >>x509.genkey "O = Magrathea" + @echo >>x509.genkey "CN = Glacier signing key" + @echo >>x509.genkey "emailAddress = slartibartfast@magrathea.h2g2" + @echo >>x509.genkey + @echo >>x509.genkey "[ myexts ]" + @echo >>x509.genkey "basicConstraints=critical,CA:FALSE" + @echo >>x509.genkey "keyUsage=digitalSignature" + @echo >>x509.genkey "subjectKeyIdentifier=hash" + @echo >>x509.genkey "authorityKeyIdentifier=keyid" +endif diff --git a/kernel/acct.c b/kernel/acct.c index 02e6167a53b0..051e071a06e7 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -193,7 +193,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, } } -static int acct_on(char *name) +static int acct_on(struct filename *pathname) { struct file *file; struct vfsmount *mnt; @@ -201,7 +201,7 @@ static int acct_on(char *name) struct bsd_acct_struct *acct = NULL; /* Difference from BSD - they don't do O_APPEND */ - file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0); + file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0); if (IS_ERR(file)) return PTR_ERR(file); @@ -260,7 +260,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name) return -EPERM; if (name) { - char *tmp = getname(name); + struct filename *tmp = getname(name); if (IS_ERR(tmp)) return (PTR_ERR(tmp)); error = acct_on(tmp); @@ -507,8 +507,8 @@ static void do_acct_process(struct bsd_acct_struct *acct, do_div(elapsed, AHZ); ac.ac_btime = get_seconds() - elapsed; /* we really need to bite the bullet and change layout */ - ac.ac_uid = orig_cred->uid; - ac.ac_gid = orig_cred->gid; + ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); + ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); #if ACCT_VERSION==2 ac.ac_ahz = AHZ; #endif diff --git a/kernel/audit.c b/kernel/audit.c index ea3b7b6191c7..40414e9143db 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -61,6 +61,7 @@ #include <linux/netlink.h> #include <linux/freezer.h> #include <linux/tty.h> +#include <linux/pid_namespace.h> #include "audit.h" @@ -87,11 +88,11 @@ static int audit_failure = AUDIT_FAIL_PRINTK; /* * If audit records are to be written to the netlink socket, audit_pid - * contains the pid of the auditd process and audit_nlk_pid contains - * the pid to use to send netlink messages to that process. + * contains the pid of the auditd process and audit_nlk_portid contains + * the portid to use to send netlink messages to that process. */ int audit_pid; -static int audit_nlk_pid; +static int audit_nlk_portid; /* If audit_rate_limit is non-zero, limit the rate of sending audit records * to that number per second. This prevents DoS attacks, but results in @@ -104,7 +105,7 @@ static int audit_backlog_wait_time = 60 * HZ; static int audit_backlog_wait_overflow = 0; /* The identity of the user shutting down the audit system. */ -uid_t audit_sig_uid = -1; +kuid_t audit_sig_uid = INVALID_UID; pid_t audit_sig_pid = -1; u32 audit_sig_sid = 0; @@ -264,7 +265,7 @@ void audit_log_lost(const char *message) } static int audit_log_config_change(char *function_name, int new, int old, - uid_t loginuid, u32 sessionid, u32 sid, + kuid_t loginuid, u32 sessionid, u32 sid, int allow_changes) { struct audit_buffer *ab; @@ -272,7 +273,7 @@ static int audit_log_config_change(char *function_name, int new, int old, ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, - old, loginuid, sessionid); + old, from_kuid(&init_user_ns, loginuid), sessionid); if (sid) { char *ctx = NULL; u32 len; @@ -292,7 +293,7 @@ static int audit_log_config_change(char *function_name, int new, int old, } static int audit_do_config_change(char *function_name, int *to_change, - int new, uid_t loginuid, u32 sessionid, + int new, kuid_t loginuid, u32 sessionid, u32 sid) { int allow_changes, rc = 0, old = *to_change; @@ -319,21 +320,21 @@ static int audit_do_config_change(char *function_name, int *to_change, return rc; } -static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid, +static int audit_set_rate_limit(int limit, kuid_t loginuid, u32 sessionid, u32 sid) { return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit, loginuid, sessionid, sid); } -static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid, +static int audit_set_backlog_limit(int limit, kuid_t loginuid, u32 sessionid, u32 sid) { return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit, loginuid, sessionid, sid); } -static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid) +static int audit_set_enabled(int state, kuid_t loginuid, u32 sessionid, u32 sid) { int rc; if (state < AUDIT_OFF || state > AUDIT_LOCKED) @@ -348,7 +349,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid) return rc; } -static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid) +static int audit_set_failure(int state, kuid_t loginuid, u32 sessionid, u32 sid) { if (state != AUDIT_FAIL_SILENT && state != AUDIT_FAIL_PRINTK @@ -401,7 +402,7 @@ static void kauditd_send_skb(struct sk_buff *skb) int err; /* take a reference in case we can't send it and we want to hold it */ skb_get(skb); - err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); + err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); if (err < 0) { BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); @@ -467,24 +468,6 @@ static int kauditd_thread(void *dummy) return 0; } -static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid) -{ - struct task_struct *tsk; - int err; - - rcu_read_lock(); - tsk = find_task_by_vpid(pid); - if (!tsk) { - rcu_read_unlock(); - return -ESRCH; - } - get_task_struct(tsk); - rcu_read_unlock(); - err = tty_audit_push_task(tsk, loginuid, sessionid); - put_task_struct(tsk); - return err; -} - int audit_send_list(void *_dest) { struct audit_netlink_list *dest = _dest; @@ -588,6 +571,11 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) { int err = 0; + /* Only support the initial namespaces for now. */ + if ((current_user_ns() != &init_user_ns) || + (task_active_pid_ns(current) != &init_pid_ns)) + return -EPERM; + switch (msg_type) { case AUDIT_GET: case AUDIT_LIST: @@ -619,8 +607,7 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) } static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, - u32 pid, u32 uid, uid_t auid, u32 ses, - u32 sid) + kuid_t auid, u32 ses, u32 sid) { int rc = 0; char *ctx = NULL; @@ -633,7 +620,9 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", - pid, uid, auid, ses); + task_tgid_vnr(current), + from_kuid(&init_user_ns, current_uid()), + from_kuid(&init_user_ns, auid), ses); if (sid) { rc = security_secid_to_secctx(sid, &ctx, &len); if (rc) @@ -649,13 +638,13 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { - u32 uid, pid, seq, sid; + u32 seq, sid; void *data; struct audit_status *status_get, status_set; int err; struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; - uid_t loginuid; /* loginuid of sender */ + kuid_t loginuid; /* loginuid of sender */ u32 sessionid; struct audit_sig_info *sig_data; char *ctx = NULL; @@ -675,8 +664,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err; } - pid = NETLINK_CREDS(skb)->pid; - uid = NETLINK_CREDS(skb)->uid; loginuid = audit_get_loginuid(current); sessionid = audit_get_sessionid(current); security_task_getsecid(current, &sid); @@ -692,7 +679,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) status_set.backlog_limit = audit_backlog_limit; status_set.lost = atomic_read(&audit_lost); status_set.backlog = skb_queue_len(&audit_skb_queue); - audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0, + audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, &status_set, sizeof(status_set)); break; case AUDIT_SET: @@ -720,7 +707,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) sessionid, sid, 1); audit_pid = new_pid; - audit_nlk_pid = NETLINK_CB(skb).pid; + audit_nlk_portid = NETLINK_CB(skb).portid; } if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { err = audit_set_rate_limit(status_get->rate_limit, @@ -738,16 +725,16 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (!audit_enabled && msg_type != AUDIT_USER_AVC) return 0; - err = audit_filter_user(&NETLINK_CB(skb)); + err = audit_filter_user(); if (err == 1) { err = 0; if (msg_type == AUDIT_USER_TTY) { - err = audit_prepare_user_tty(pid, loginuid, + err = tty_audit_push_task(current, loginuid, sessionid); if (err) break; } - audit_log_common_recv_msg(&ab, msg_type, pid, uid, + audit_log_common_recv_msg(&ab, msg_type, loginuid, sessionid, sid); if (msg_type != AUDIT_USER_TTY) @@ -763,7 +750,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) size--; audit_log_n_untrustedstring(ab, data, size); } - audit_set_pid(ab, pid); + audit_set_pid(ab, NETLINK_CB(skb).portid); audit_log_end(ab); } break; @@ -772,8 +759,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (nlmsg_len(nlh) < sizeof(struct audit_rule)) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, - uid, loginuid, sessionid, sid); + audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, + loginuid, sessionid, sid); audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled); @@ -782,8 +769,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } /* fallthrough */ case AUDIT_LIST: - err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, - uid, seq, data, nlmsg_len(nlh), + err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, + seq, data, nlmsg_len(nlh), loginuid, sessionid, sid); break; case AUDIT_ADD_RULE: @@ -791,8 +778,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, - uid, loginuid, sessionid, sid); + audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, + loginuid, sessionid, sid); audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled); @@ -801,15 +788,15 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } /* fallthrough */ case AUDIT_LIST_RULES: - err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, - uid, seq, data, nlmsg_len(nlh), + err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, + seq, data, nlmsg_len(nlh), loginuid, sessionid, sid); break; case AUDIT_TRIM: audit_trim_trees(); - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, - uid, loginuid, sessionid, sid); + audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, + loginuid, sessionid, sid); audit_log_format(ab, " op=trim res=1"); audit_log_end(ab); @@ -840,8 +827,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) /* OK, here comes... */ err = audit_tag_tree(old, new); - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, - uid, loginuid, sessionid, sid); + audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, + loginuid, sessionid, sid); audit_log_format(ab, " op=make_equiv old="); audit_log_untrustedstring(ab, old); @@ -866,53 +853,41 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) security_release_secctx(ctx, len); return -ENOMEM; } - sig_data->uid = audit_sig_uid; + sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid); sig_data->pid = audit_sig_pid; if (audit_sig_sid) { memcpy(sig_data->ctx, ctx, len); security_release_secctx(ctx, len); } - audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, + audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_SIGNAL_INFO, 0, 0, sig_data, sizeof(*sig_data) + len); kfree(sig_data); break; case AUDIT_TTY_GET: { struct audit_tty_status s; - struct task_struct *tsk; - unsigned long flags; - - rcu_read_lock(); - tsk = find_task_by_vpid(pid); - if (tsk && lock_task_sighand(tsk, &flags)) { - s.enabled = tsk->signal->audit_tty != 0; - unlock_task_sighand(tsk, &flags); - } else - err = -ESRCH; - rcu_read_unlock(); - - if (!err) - audit_send_reply(NETLINK_CB(skb).pid, seq, - AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); + struct task_struct *tsk = current; + + spin_lock_irq(&tsk->sighand->siglock); + s.enabled = tsk->signal->audit_tty != 0; + spin_unlock_irq(&tsk->sighand->siglock); + + audit_send_reply(NETLINK_CB(skb).portid, seq, + AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_TTY_SET: { struct audit_tty_status *s; - struct task_struct *tsk; - unsigned long flags; + struct task_struct *tsk = current; if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) return -EINVAL; s = data; if (s->enabled != 0 && s->enabled != 1) return -EINVAL; - rcu_read_lock(); - tsk = find_task_by_vpid(pid); - if (tsk && lock_task_sighand(tsk, &flags)) { - tsk->signal->audit_tty = s->enabled != 0; - unlock_task_sighand(tsk, &flags); - } else - err = -ESRCH; - rcu_read_unlock(); + + spin_lock_irq(&tsk->sighand->siglock); + tsk->signal->audit_tty = s->enabled != 0; + spin_unlock_irq(&tsk->sighand->siglock); break; } default: @@ -971,8 +946,7 @@ static int __init audit_init(void) printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? "enabled" : "disabled"); - audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, - THIS_MODULE, &cfg); + audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, &cfg); if (!audit_sock) audit_panic("cannot initialize netlink socket"); else @@ -1466,6 +1440,8 @@ void audit_log_link_denied(const char *operation, struct path *link) ab = audit_log_start(current->audit_context, GFP_KERNEL, AUDIT_ANOM_LINK); + if (!ab) + return; audit_log_format(ab, "op=%s action=denied", operation); audit_log_format(ab, " pid=%d comm=", current->pid); audit_log_untrustedstring(ab, current->comm); diff --git a/kernel/audit.h b/kernel/audit.h index 816766803371..d51cba868e1b 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -74,10 +74,15 @@ static inline int audit_hash_ino(u32 ino) return (ino & (AUDIT_INODE_BUCKETS-1)); } +/* Indicates that audit should log the full pathname. */ +#define AUDIT_NAME_FULL -1 + extern int audit_match_class(int class, unsigned syscall); extern int audit_comparator(const u32 left, const u32 op, const u32 right); -extern int audit_compare_dname_path(const char *dname, const char *path, - int *dirlen); +extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right); +extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); +extern int parent_len(const char *path); +extern int audit_compare_dname_path(const char *dname, const char *path, int plen); extern struct sk_buff * audit_make_reply(int pid, int seq, int type, int done, int multi, const void *payload, int size); @@ -144,7 +149,7 @@ extern void audit_kill_trees(struct list_head *); extern char *audit_unpack_string(void **, size_t *, size_t); extern pid_t audit_sig_pid; -extern uid_t audit_sig_uid; +extern kuid_t audit_sig_uid; extern u32 audit_sig_sid; #ifdef CONFIG_AUDITSYSCALL diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index a66affc1c12c..4a599f699adc 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -241,7 +241,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc struct audit_buffer *ab; ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); audit_log_format(ab, "auid=%u ses=%u op=", - audit_get_loginuid(current), + from_kuid(&init_user_ns, audit_get_loginuid(current)), audit_get_sessionid(current)); audit_log_string(ab, op); audit_log_format(ab, " path="); @@ -265,7 +265,8 @@ static void audit_update_watch(struct audit_parent *parent, /* Run all of the watches on this parent looking for the one that * matches the given dname */ list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { - if (audit_compare_dname_path(dname, owatch->path, NULL)) + if (audit_compare_dname_path(dname, owatch->path, + AUDIT_NAME_FULL)) continue; /* If the update involves invalidating rules, do the inode-based diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index a6c3f1abd206..7f19f23d38a3 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -342,6 +342,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); f->val = rule->values[i]; + f->uid = INVALID_UID; + f->gid = INVALID_GID; err = -EINVAL; if (f->op == Audit_bad) @@ -350,16 +352,32 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) switch(f->type) { default: goto exit_free; - case AUDIT_PID: case AUDIT_UID: case AUDIT_EUID: case AUDIT_SUID: case AUDIT_FSUID: + case AUDIT_LOGINUID: + /* bit ops not implemented for uid comparisons */ + if (f->op == Audit_bitmask || f->op == Audit_bittest) + goto exit_free; + + f->uid = make_kuid(current_user_ns(), f->val); + if (!uid_valid(f->uid)) + goto exit_free; + break; case AUDIT_GID: case AUDIT_EGID: case AUDIT_SGID: case AUDIT_FSGID: - case AUDIT_LOGINUID: + /* bit ops not implemented for gid comparisons */ + if (f->op == Audit_bitmask || f->op == Audit_bittest) + goto exit_free; + + f->gid = make_kgid(current_user_ns(), f->val); + if (!gid_valid(f->gid)) + goto exit_free; + break; + case AUDIT_PID: case AUDIT_PERS: case AUDIT_MSGTYPE: case AUDIT_PPID: @@ -437,19 +455,39 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->type = data->fields[i]; f->val = data->values[i]; + f->uid = INVALID_UID; + f->gid = INVALID_GID; f->lsm_str = NULL; f->lsm_rule = NULL; switch(f->type) { - case AUDIT_PID: case AUDIT_UID: case AUDIT_EUID: case AUDIT_SUID: case AUDIT_FSUID: + case AUDIT_LOGINUID: + case AUDIT_OBJ_UID: + /* bit ops not implemented for uid comparisons */ + if (f->op == Audit_bitmask || f->op == Audit_bittest) + goto exit_free; + + f->uid = make_kuid(current_user_ns(), f->val); + if (!uid_valid(f->uid)) + goto exit_free; + break; case AUDIT_GID: case AUDIT_EGID: case AUDIT_SGID: case AUDIT_FSGID: - case AUDIT_LOGINUID: + case AUDIT_OBJ_GID: + /* bit ops not implemented for gid comparisons */ + if (f->op == Audit_bitmask || f->op == Audit_bittest) + goto exit_free; + + f->gid = make_kgid(current_user_ns(), f->val); + if (!gid_valid(f->gid)) + goto exit_free; + break; + case AUDIT_PID: case AUDIT_PERS: case AUDIT_MSGTYPE: case AUDIT_PPID: @@ -461,8 +499,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_ARG1: case AUDIT_ARG2: case AUDIT_ARG3: - case AUDIT_OBJ_UID: - case AUDIT_OBJ_GID: break; case AUDIT_ARCH: entry->rule.arch_f = f; @@ -707,6 +743,23 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) if (strcmp(a->filterkey, b->filterkey)) return 1; break; + case AUDIT_UID: + case AUDIT_EUID: + case AUDIT_SUID: + case AUDIT_FSUID: + case AUDIT_LOGINUID: + case AUDIT_OBJ_UID: + if (!uid_eq(a->fields[i].uid, b->fields[i].uid)) + return 1; + break; + case AUDIT_GID: + case AUDIT_EGID: + case AUDIT_SGID: + case AUDIT_FSGID: + case AUDIT_OBJ_GID: + if (!gid_eq(a->fields[i].gid, b->fields[i].gid)) + return 1; + break; default: if (a->fields[i].val != b->fields[i].val) return 1; @@ -1056,7 +1109,7 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) } /* Log rule additions and removals */ -static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, +static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid, char *action, struct audit_krule *rule, int res) { @@ -1068,7 +1121,8 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (!ab) return; - audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid); + audit_log_format(ab, "auid=%u ses=%u", + from_kuid(&init_user_ns, loginuid), sessionid); if (sid) { char *ctx = NULL; u32 len; @@ -1098,8 +1152,8 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, * @sessionid: sessionid for netlink audit message * @sid: SE Linux Security ID of sender */ -int audit_receive_filter(int type, int pid, int uid, int seq, void *data, - size_t datasz, uid_t loginuid, u32 sessionid, u32 sid) +int audit_receive_filter(int type, int pid, int seq, void *data, + size_t datasz, kuid_t loginuid, u32 sessionid, u32 sid) { struct task_struct *tsk; struct audit_netlink_list *dest; @@ -1198,46 +1252,110 @@ int audit_comparator(u32 left, u32 op, u32 right) } } -/* Compare given dentry name with last component in given path, - * return of 0 indicates a match. */ -int audit_compare_dname_path(const char *dname, const char *path, - int *dirlen) +int audit_uid_comparator(kuid_t left, u32 op, kuid_t right) { - int dlen, plen; - const char *p; + switch (op) { + case Audit_equal: + return uid_eq(left, right); + case Audit_not_equal: + return !uid_eq(left, right); + case Audit_lt: + return uid_lt(left, right); + case Audit_le: + return uid_lte(left, right); + case Audit_gt: + return uid_gt(left, right); + case Audit_ge: + return uid_gte(left, right); + case Audit_bitmask: + case Audit_bittest: + default: + BUG(); + return 0; + } +} - if (!dname || !path) - return 1; +int audit_gid_comparator(kgid_t left, u32 op, kgid_t right) +{ + switch (op) { + case Audit_equal: + return gid_eq(left, right); + case Audit_not_equal: + return !gid_eq(left, right); + case Audit_lt: + return gid_lt(left, right); + case Audit_le: + return gid_lte(left, right); + case Audit_gt: + return gid_gt(left, right); + case Audit_ge: + return gid_gte(left, right); + case Audit_bitmask: + case Audit_bittest: + default: + BUG(); + return 0; + } +} + +/** + * parent_len - find the length of the parent portion of a pathname + * @path: pathname of which to determine length + */ +int parent_len(const char *path) +{ + int plen; + const char *p; - dlen = strlen(dname); plen = strlen(path); - if (plen < dlen) - return 1; + + if (plen == 0) + return plen; /* disregard trailing slashes */ p = path + plen - 1; while ((*p == '/') && (p > path)) p--; - /* find last path component */ - p = p - dlen + 1; - if (p < path) + /* walk backward until we find the next slash or hit beginning */ + while ((*p != '/') && (p > path)) + p--; + + /* did we find a slash? Then increment to include it in path */ + if (*p == '/') + p++; + + return p - path; +} + +/** + * audit_compare_dname_path - compare given dentry name with last component in + * given path. Return of 0 indicates a match. + * @dname: dentry name that we're comparing + * @path: full pathname that we're comparing + * @parentlen: length of the parent if known. Passing in AUDIT_NAME_FULL + * here indicates that we must compute this value. + */ +int audit_compare_dname_path(const char *dname, const char *path, int parentlen) +{ + int dlen, pathlen; + const char *p; + + dlen = strlen(dname); + pathlen = strlen(path); + if (pathlen < dlen) return 1; - else if (p > path) { - if (*--p != '/') - return 1; - else - p++; - } - /* return length of path's directory component */ - if (dirlen) - *dirlen = p - path; + parentlen = parentlen == AUDIT_NAME_FULL ? parent_len(path) : parentlen; + if (pathlen - parentlen != dlen) + return 1; + + p = path + parentlen; + return strncmp(p, dname, dlen); } -static int audit_filter_user_rules(struct netlink_skb_parms *cb, - struct audit_krule *rule, +static int audit_filter_user_rules(struct audit_krule *rule, enum audit_state *state) { int i; @@ -1249,17 +1367,17 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, switch (f->type) { case AUDIT_PID: - result = audit_comparator(cb->creds.pid, f->op, f->val); + result = audit_comparator(task_pid_vnr(current), f->op, f->val); break; case AUDIT_UID: - result = audit_comparator(cb->creds.uid, f->op, f->val); + result = audit_uid_comparator(current_uid(), f->op, f->uid); break; case AUDIT_GID: - result = audit_comparator(cb->creds.gid, f->op, f->val); + result = audit_gid_comparator(current_gid(), f->op, f->gid); break; case AUDIT_LOGINUID: - result = audit_comparator(audit_get_loginuid(current), - f->op, f->val); + result = audit_uid_comparator(audit_get_loginuid(current), + f->op, f->uid); break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: @@ -1287,7 +1405,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, return 1; } -int audit_filter_user(struct netlink_skb_parms *cb) +int audit_filter_user(void) { enum audit_state state = AUDIT_DISABLED; struct audit_entry *e; @@ -1295,7 +1413,7 @@ int audit_filter_user(struct netlink_skb_parms *cb) rcu_read_lock(); list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { - if (audit_filter_user_rules(cb, &e->rule, &state)) { + if (audit_filter_user_rules(&e->rule, &state)) { if (state == AUDIT_DISABLED) ret = 0; break; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4b96415527b8..e37e6a12c5e3 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -81,9 +81,6 @@ * a name dynamically and also add those to the list anchored by names_list. */ #define AUDIT_NAMES 5 -/* Indicates that audit should log the full pathname. */ -#define AUDIT_NAME_FULL -1 - /* no execve audit message should be longer than this (userspace limits) */ #define MAX_EXECVE_AUDIT_LEN 7500 @@ -106,27 +103,29 @@ struct audit_cap_data { * we don't let putname() free it (instead we free all of the saved * pointers at syscall exit time). * - * Further, in fs/namei.c:path_lookup() we store the inode and device. */ + * Further, in fs/namei.c:path_lookup() we store the inode and device. + */ struct audit_names { - struct list_head list; /* audit_context->names_list */ - const char *name; - unsigned long ino; - dev_t dev; - umode_t mode; - uid_t uid; - gid_t gid; - dev_t rdev; - u32 osid; - struct audit_cap_data fcap; - unsigned int fcap_ver; - int name_len; /* number of name's characters to log */ - bool name_put; /* call __putname() for this name */ + struct list_head list; /* audit_context->names_list */ + struct filename *name; + unsigned long ino; + dev_t dev; + umode_t mode; + kuid_t uid; + kgid_t gid; + dev_t rdev; + u32 osid; + struct audit_cap_data fcap; + unsigned int fcap_ver; + int name_len; /* number of name's characters to log */ + unsigned char type; /* record type */ + bool name_put; /* call __putname() for this name */ /* * This was an allocated audit_names and not from the array of * names allocated in the task audit context. Thus this name * should be freed on syscall exit */ - bool should_free; + bool should_free; }; struct audit_aux_data { @@ -149,8 +148,8 @@ struct audit_aux_data_execve { struct audit_aux_data_pids { struct audit_aux_data d; pid_t target_pid[AUDIT_AUX_PIDS]; - uid_t target_auid[AUDIT_AUX_PIDS]; - uid_t target_uid[AUDIT_AUX_PIDS]; + kuid_t target_auid[AUDIT_AUX_PIDS]; + kuid_t target_uid[AUDIT_AUX_PIDS]; unsigned int target_sessionid[AUDIT_AUX_PIDS]; u32 target_sid[AUDIT_AUX_PIDS]; char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN]; @@ -201,21 +200,20 @@ struct audit_context { struct list_head names_list; /* anchor for struct audit_names->list */ char * filterkey; /* key for rule that triggered record */ struct path pwd; - struct audit_context *previous; /* For nested syscalls */ struct audit_aux_data *aux; struct audit_aux_data *aux_pids; struct sockaddr_storage *sockaddr; size_t sockaddr_len; /* Save things to print about task_struct */ pid_t pid, ppid; - uid_t uid, euid, suid, fsuid; - gid_t gid, egid, sgid, fsgid; + kuid_t uid, euid, suid, fsuid; + kgid_t gid, egid, sgid, fsgid; unsigned long personality; int arch; pid_t target_pid; - uid_t target_auid; - uid_t target_uid; + kuid_t target_auid; + kuid_t target_uid; unsigned int target_sessionid; u32 target_sid; char target_comm[TASK_COMM_LEN]; @@ -231,8 +229,8 @@ struct audit_context { long args[6]; } socketcall; struct { - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; umode_t mode; u32 osid; int has_perm; @@ -464,37 +462,47 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) return 0; } -static int audit_compare_id(uid_t uid1, - struct audit_names *name, - unsigned long name_offset, - struct audit_field *f, - struct audit_context *ctx) +static int audit_compare_uid(kuid_t uid, + struct audit_names *name, + struct audit_field *f, + struct audit_context *ctx) { struct audit_names *n; - unsigned long addr; - uid_t uid2; int rc; - - BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t)); - + if (name) { - addr = (unsigned long)name; - addr += name_offset; - - uid2 = *(uid_t *)addr; - rc = audit_comparator(uid1, f->op, uid2); + rc = audit_uid_comparator(uid, f->op, name->uid); if (rc) return rc; } - + if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { - addr = (unsigned long)n; - addr += name_offset; - - uid2 = *(uid_t *)addr; + rc = audit_uid_comparator(uid, f->op, n->uid); + if (rc) + return rc; + } + } + return 0; +} - rc = audit_comparator(uid1, f->op, uid2); +static int audit_compare_gid(kgid_t gid, + struct audit_names *name, + struct audit_field *f, + struct audit_context *ctx) +{ + struct audit_names *n; + int rc; + + if (name) { + rc = audit_gid_comparator(gid, f->op, name->gid); + if (rc) + return rc; + } + + if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + rc = audit_gid_comparator(gid, f->op, n->gid); if (rc) return rc; } @@ -511,80 +519,62 @@ static int audit_field_compare(struct task_struct *tsk, switch (f->val) { /* process to file object comparisons */ case AUDIT_COMPARE_UID_TO_OBJ_UID: - return audit_compare_id(cred->uid, - name, offsetof(struct audit_names, uid), - f, ctx); + return audit_compare_uid(cred->uid, name, f, ctx); case AUDIT_COMPARE_GID_TO_OBJ_GID: - return audit_compare_id(cred->gid, - name, offsetof(struct audit_names, gid), - f, ctx); + return audit_compare_gid(cred->gid, name, f, ctx); case AUDIT_COMPARE_EUID_TO_OBJ_UID: - return audit_compare_id(cred->euid, - name, offsetof(struct audit_names, uid), - f, ctx); + return audit_compare_uid(cred->euid, name, f, ctx); case AUDIT_COMPARE_EGID_TO_OBJ_GID: - return audit_compare_id(cred->egid, - name, offsetof(struct audit_names, gid), - f, ctx); + return audit_compare_gid(cred->egid, name, f, ctx); case AUDIT_COMPARE_AUID_TO_OBJ_UID: - return audit_compare_id(tsk->loginuid, - name, offsetof(struct audit_names, uid), - f, ctx); + return audit_compare_uid(tsk->loginuid, name, f, ctx); case AUDIT_COMPARE_SUID_TO_OBJ_UID: - return audit_compare_id(cred->suid, - name, offsetof(struct audit_names, uid), - f, ctx); + return audit_compare_uid(cred->suid, name, f, ctx); case AUDIT_COMPARE_SGID_TO_OBJ_GID: - return audit_compare_id(cred->sgid, - name, offsetof(struct audit_names, gid), - f, ctx); + return audit_compare_gid(cred->sgid, name, f, ctx); case AUDIT_COMPARE_FSUID_TO_OBJ_UID: - return audit_compare_id(cred->fsuid, - name, offsetof(struct audit_names, uid), - f, ctx); + return audit_compare_uid(cred->fsuid, name, f, ctx); case AUDIT_COMPARE_FSGID_TO_OBJ_GID: - return audit_compare_id(cred->fsgid, - name, offsetof(struct audit_names, gid), - f, ctx); + return audit_compare_gid(cred->fsgid, name, f, ctx); /* uid comparisons */ case AUDIT_COMPARE_UID_TO_AUID: - return audit_comparator(cred->uid, f->op, tsk->loginuid); + return audit_uid_comparator(cred->uid, f->op, tsk->loginuid); case AUDIT_COMPARE_UID_TO_EUID: - return audit_comparator(cred->uid, f->op, cred->euid); + return audit_uid_comparator(cred->uid, f->op, cred->euid); case AUDIT_COMPARE_UID_TO_SUID: - return audit_comparator(cred->uid, f->op, cred->suid); + return audit_uid_comparator(cred->uid, f->op, cred->suid); case AUDIT_COMPARE_UID_TO_FSUID: - return audit_comparator(cred->uid, f->op, cred->fsuid); + return audit_uid_comparator(cred->uid, f->op, cred->fsuid); /* auid comparisons */ case AUDIT_COMPARE_AUID_TO_EUID: - return audit_comparator(tsk->loginuid, f->op, cred->euid); + return audit_uid_comparator(tsk->loginuid, f->op, cred->euid); case AUDIT_COMPARE_AUID_TO_SUID: - return audit_comparator(tsk->loginuid, f->op, cred->suid); + return audit_uid_comparator(tsk->loginuid, f->op, cred->suid); case AUDIT_COMPARE_AUID_TO_FSUID: - return audit_comparator(tsk->loginuid, f->op, cred->fsuid); + return audit_uid_comparator(tsk->loginuid, f->op, cred->fsuid); /* euid comparisons */ case AUDIT_COMPARE_EUID_TO_SUID: - return audit_comparator(cred->euid, f->op, cred->suid); + return audit_uid_comparator(cred->euid, f->op, cred->suid); case AUDIT_COMPARE_EUID_TO_FSUID: - return audit_comparator(cred->euid, f->op, cred->fsuid); + return audit_uid_comparator(cred->euid, f->op, cred->fsuid); /* suid comparisons */ case AUDIT_COMPARE_SUID_TO_FSUID: - return audit_comparator(cred->suid, f->op, cred->fsuid); + return audit_uid_comparator(cred->suid, f->op, cred->fsuid); /* gid comparisons */ case AUDIT_COMPARE_GID_TO_EGID: - return audit_comparator(cred->gid, f->op, cred->egid); + return audit_gid_comparator(cred->gid, f->op, cred->egid); case AUDIT_COMPARE_GID_TO_SGID: - return audit_comparator(cred->gid, f->op, cred->sgid); + return audit_gid_comparator(cred->gid, f->op, cred->sgid); case AUDIT_COMPARE_GID_TO_FSGID: - return audit_comparator(cred->gid, f->op, cred->fsgid); + return audit_gid_comparator(cred->gid, f->op, cred->fsgid); /* egid comparisons */ case AUDIT_COMPARE_EGID_TO_SGID: - return audit_comparator(cred->egid, f->op, cred->sgid); + return audit_gid_comparator(cred->egid, f->op, cred->sgid); case AUDIT_COMPARE_EGID_TO_FSGID: - return audit_comparator(cred->egid, f->op, cred->fsgid); + return audit_gid_comparator(cred->egid, f->op, cred->fsgid); /* sgid comparison */ case AUDIT_COMPARE_SGID_TO_FSGID: - return audit_comparator(cred->sgid, f->op, cred->fsgid); + return audit_gid_comparator(cred->sgid, f->op, cred->fsgid); default: WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); return 0; @@ -630,28 +620,28 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_UID: - result = audit_comparator(cred->uid, f->op, f->val); + result = audit_uid_comparator(cred->uid, f->op, f->uid); break; case AUDIT_EUID: - result = audit_comparator(cred->euid, f->op, f->val); + result = audit_uid_comparator(cred->euid, f->op, f->uid); break; case AUDIT_SUID: - result = audit_comparator(cred->suid, f->op, f->val); + result = audit_uid_comparator(cred->suid, f->op, f->uid); break; case AUDIT_FSUID: - result = audit_comparator(cred->fsuid, f->op, f->val); + result = audit_uid_comparator(cred->fsuid, f->op, f->uid); break; case AUDIT_GID: - result = audit_comparator(cred->gid, f->op, f->val); + result = audit_gid_comparator(cred->gid, f->op, f->gid); break; case AUDIT_EGID: - result = audit_comparator(cred->egid, f->op, f->val); + result = audit_gid_comparator(cred->egid, f->op, f->gid); break; case AUDIT_SGID: - result = audit_comparator(cred->sgid, f->op, f->val); + result = audit_gid_comparator(cred->sgid, f->op, f->gid); break; case AUDIT_FSGID: - result = audit_comparator(cred->fsgid, f->op, f->val); + result = audit_gid_comparator(cred->fsgid, f->op, f->gid); break; case AUDIT_PERS: result = audit_comparator(tsk->personality, f->op, f->val); @@ -717,10 +707,10 @@ static int audit_filter_rules(struct task_struct *tsk, break; case AUDIT_OBJ_UID: if (name) { - result = audit_comparator(name->uid, f->op, f->val); + result = audit_uid_comparator(name->uid, f->op, f->uid); } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(n->uid, f->op, f->val)) { + if (audit_uid_comparator(n->uid, f->op, f->uid)) { ++result; break; } @@ -729,10 +719,10 @@ static int audit_filter_rules(struct task_struct *tsk, break; case AUDIT_OBJ_GID: if (name) { - result = audit_comparator(name->gid, f->op, f->val); + result = audit_gid_comparator(name->gid, f->op, f->gid); } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(n->gid, f->op, f->val)) { + if (audit_gid_comparator(n->gid, f->op, f->gid)) { ++result; break; } @@ -750,7 +740,7 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_LOGINUID: result = 0; if (ctx) - result = audit_comparator(tsk->loginuid, f->op, f->val); + result = audit_uid_comparator(tsk->loginuid, f->op, f->uid); break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: @@ -1006,7 +996,7 @@ static inline void audit_free_names(struct audit_context *context) context->ino_count); list_for_each_entry(n, &context->names_list, list) { printk(KERN_ERR "names[%d] = %p = %s\n", i, - n->name, n->name ?: "(null)"); + n->name, n->name->name ?: "(null)"); } dump_stack(); return; @@ -1100,29 +1090,13 @@ int audit_alloc(struct task_struct *tsk) static inline void audit_free_context(struct audit_context *context) { - struct audit_context *previous; - int count = 0; - - do { - previous = context->previous; - if (previous || (count && count < 10)) { - ++count; - printk(KERN_ERR "audit(:%d): major=%d name_count=%d:" - " freeing multiple contexts (%d)\n", - context->serial, context->major, - context->name_count, count); - } - audit_free_names(context); - unroll_tree_refs(context, NULL, 0); - free_tree_refs(context); - audit_free_aux(context); - kfree(context->filterkey); - kfree(context->sockaddr); - kfree(context); - context = previous; - } while (context); - if (count >= 10) - printk(KERN_ERR "audit: freed %d contexts\n", count); + audit_free_names(context); + unroll_tree_refs(context, NULL, 0); + free_tree_refs(context); + audit_free_aux(context); + kfree(context->filterkey); + kfree(context->sockaddr); + kfree(context); } void audit_log_task_context(struct audit_buffer *ab) @@ -1154,13 +1128,43 @@ error_path: EXPORT_SYMBOL(audit_log_task_context); -static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) +void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) { + const struct cred *cred; char name[sizeof(tsk->comm)]; struct mm_struct *mm = tsk->mm; - struct vm_area_struct *vma; + char *tty; + + if (!ab) + return; /* tsk == current */ + cred = current_cred(); + + spin_lock_irq(&tsk->sighand->siglock); + if (tsk->signal && tsk->signal->tty) + tty = tsk->signal->tty->name; + else + tty = "(none)"; + spin_unlock_irq(&tsk->sighand->siglock); + + + audit_log_format(ab, + " ppid=%ld pid=%d auid=%u uid=%u gid=%u" + " euid=%u suid=%u fsuid=%u" + " egid=%u sgid=%u fsgid=%u ses=%u tty=%s", + sys_getppid(), + tsk->pid, + from_kuid(&init_user_ns, tsk->loginuid), + from_kuid(&init_user_ns, cred->uid), + from_kgid(&init_user_ns, cred->gid), + from_kuid(&init_user_ns, cred->euid), + from_kuid(&init_user_ns, cred->suid), + from_kuid(&init_user_ns, cred->fsuid), + from_kgid(&init_user_ns, cred->egid), + from_kgid(&init_user_ns, cred->sgid), + from_kgid(&init_user_ns, cred->fsgid), + tsk->sessionid, tty); get_task_comm(name, tsk); audit_log_format(ab, " comm="); @@ -1168,23 +1172,17 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk if (mm) { down_read(&mm->mmap_sem); - vma = mm->mmap; - while (vma) { - if ((vma->vm_flags & VM_EXECUTABLE) && - vma->vm_file) { - audit_log_d_path(ab, " exe=", - &vma->vm_file->f_path); - break; - } - vma = vma->vm_next; - } + if (mm->exe_file) + audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); up_read(&mm->mmap_sem); } audit_log_task_context(ab); } +EXPORT_SYMBOL(audit_log_task_info); + static int audit_log_pid_context(struct audit_context *context, pid_t pid, - uid_t auid, uid_t uid, unsigned int sessionid, + kuid_t auid, kuid_t uid, unsigned int sessionid, u32 sid, char *comm) { struct audit_buffer *ab; @@ -1196,8 +1194,9 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, if (!ab) return rc; - audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid, - uid, sessionid); + audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, + from_kuid(&init_user_ns, auid), + from_kuid(&init_user_ns, uid), sessionid); if (security_secid_to_secctx(sid, &ctx, &len)) { audit_log_format(ab, " obj=(none)"); rc = 1; @@ -1447,7 +1446,9 @@ static void show_special(struct audit_context *context, int *call_panic) u32 osid = context->ipc.osid; audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho", - context->ipc.uid, context->ipc.gid, context->ipc.mode); + from_kuid(&init_user_ns, context->ipc.uid), + from_kgid(&init_user_ns, context->ipc.gid), + context->ipc.mode); if (osid) { char *ctx = NULL; u32 len; @@ -1536,7 +1537,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, case AUDIT_NAME_FULL: /* log the full path */ audit_log_format(ab, " name="); - audit_log_untrustedstring(ab, n->name); + audit_log_untrustedstring(ab, n->name->name); break; case 0: /* name was specified as a relative path and the @@ -1546,7 +1547,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, default: /* log the name's directory component */ audit_log_format(ab, " name="); - audit_log_n_untrustedstring(ab, n->name, + audit_log_n_untrustedstring(ab, n->name->name, n->name_len); } } else @@ -1560,8 +1561,8 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, MAJOR(n->dev), MINOR(n->dev), n->mode, - n->uid, - n->gid, + from_kuid(&init_user_ns, n->uid), + from_kgid(&init_user_ns, n->gid), MAJOR(n->rdev), MINOR(n->rdev)); } @@ -1585,26 +1586,12 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { - const struct cred *cred; int i, call_panic = 0; struct audit_buffer *ab; struct audit_aux_data *aux; - const char *tty; struct audit_names *n; /* tsk == current */ - context->pid = tsk->pid; - if (!context->ppid) - context->ppid = sys_getppid(); - cred = current_cred(); - context->uid = cred->uid; - context->gid = cred->gid; - context->euid = cred->euid; - context->suid = cred->suid; - context->fsuid = cred->fsuid; - context->egid = cred->egid; - context->sgid = cred->sgid; - context->fsgid = cred->fsgid; context->personality = tsk->personality; ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); @@ -1619,32 +1606,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", context->return_code); - spin_lock_irq(&tsk->sighand->siglock); - if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) - tty = tsk->signal->tty->name; - else - tty = "(none)"; - spin_unlock_irq(&tsk->sighand->siglock); - audit_log_format(ab, - " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" - " ppid=%d pid=%d auid=%u uid=%u gid=%u" - " euid=%u suid=%u fsuid=%u" - " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", - context->argv[0], - context->argv[1], - context->argv[2], - context->argv[3], - context->name_count, - context->ppid, - context->pid, - tsk->loginuid, - context->uid, - context->gid, - context->euid, context->suid, context->fsuid, - context->egid, context->sgid, context->fsgid, tty, - tsk->sessionid); - + " a0=%lx a1=%lx a2=%lx a3=%lx items=%d", + context->argv[0], + context->argv[1], + context->argv[2], + context->argv[3], + context->name_count); audit_log_task_info(ab, tsk); audit_log_key(ab, context->filterkey); @@ -1798,42 +1766,6 @@ void __audit_syscall_entry(int arch, int major, if (!context) return; - /* - * This happens only on certain architectures that make system - * calls in kernel_thread via the entry.S interface, instead of - * with direct calls. (If you are porting to a new - * architecture, hitting this condition can indicate that you - * got the _exit/_leave calls backward in entry.S.) - * - * i386 no - * x86_64 no - * ppc64 yes (see arch/powerpc/platforms/iseries/misc.S) - * - * This also happens with vm86 emulation in a non-nested manner - * (entries without exits), so this case must be caught. - */ - if (context->in_syscall) { - struct audit_context *newctx; - -#if AUDIT_DEBUG - printk(KERN_ERR - "audit(:%d) pid=%d in syscall=%d;" - " entering syscall=%d\n", - context->serial, tsk->pid, context->major, major); -#endif - newctx = audit_alloc_context(context->state); - if (newctx) { - newctx->previous = context; - context = newctx; - tsk->audit_context = newctx; - } else { - /* If we can't alloc a new context, the best we - * can do is to leak memory (any pending putname - * will be lost). The only other alternative is - * to abandon auditing. */ - audit_zero_context(context, context->state); - } - } BUG_ON(context->in_syscall || context->name_count); if (!audit_enabled) @@ -1896,28 +1828,21 @@ void __audit_syscall_exit(int success, long return_code) if (!list_empty(&context->killed_trees)) audit_kill_trees(&context->killed_trees); - if (context->previous) { - struct audit_context *new_context = context->previous; - context->previous = NULL; - audit_free_context(context); - tsk->audit_context = new_context; - } else { - audit_free_names(context); - unroll_tree_refs(context, NULL, 0); - audit_free_aux(context); - context->aux = NULL; - context->aux_pids = NULL; - context->target_pid = 0; - context->target_sid = 0; - context->sockaddr_len = 0; - context->type = 0; - context->fds[0] = -1; - if (context->state != AUDIT_RECORD_CONTEXT) { - kfree(context->filterkey); - context->filterkey = NULL; - } - tsk->audit_context = context; + audit_free_names(context); + unroll_tree_refs(context, NULL, 0); + audit_free_aux(context); + context->aux = NULL; + context->aux_pids = NULL; + context->target_pid = 0; + context->target_sid = 0; + context->sockaddr_len = 0; + context->type = 0; + context->fds[0] = -1; + if (context->state != AUDIT_RECORD_CONTEXT) { + kfree(context->filterkey); + context->filterkey = NULL; } + tsk->audit_context = context; } static inline void handle_one(const struct inode *inode) @@ -2009,7 +1934,8 @@ retry: #endif } -static struct audit_names *audit_alloc_name(struct audit_context *context) +static struct audit_names *audit_alloc_name(struct audit_context *context, + unsigned char type) { struct audit_names *aname; @@ -2024,6 +1950,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context) } aname->ino = (unsigned long)-1; + aname->type = type; list_add_tail(&aname->list, &context->names_list); context->name_count++; @@ -2034,13 +1961,36 @@ static struct audit_names *audit_alloc_name(struct audit_context *context) } /** + * audit_reusename - fill out filename with info from existing entry + * @uptr: userland ptr to pathname + * + * Search the audit_names list for the current audit context. If there is an + * existing entry with a matching "uptr" then return the filename + * associated with that audit_name. If not, return NULL. + */ +struct filename * +__audit_reusename(const __user char *uptr) +{ + struct audit_context *context = current->audit_context; + struct audit_names *n; + + list_for_each_entry(n, &context->names_list, list) { + if (!n->name) + continue; + if (n->name->uptr == uptr) + return n->name; + } + return NULL; +} + +/** * audit_getname - add a name to the list * @name: name to add * * Add a name to the list of audit names for this context. * Called from fs/namei.c:getname(). */ -void __audit_getname(const char *name) +void __audit_getname(struct filename *name) { struct audit_context *context = current->audit_context; struct audit_names *n; @@ -2054,13 +2004,19 @@ void __audit_getname(const char *name) return; } - n = audit_alloc_name(context); +#if AUDIT_DEBUG + /* The filename _must_ have a populated ->name */ + BUG_ON(!name->name); +#endif + + n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); if (!n) return; n->name = name; n->name_len = AUDIT_NAME_FULL; n->name_put = true; + name->aname = n; if (!context->pwd.dentry) get_fs_pwd(current->fs, &context->pwd); @@ -2073,7 +2029,7 @@ void __audit_getname(const char *name) * then we delay the putname until syscall exit. * Called from include/linux/fs.h:putname(). */ -void audit_putname(const char *name) +void audit_putname(struct filename *name) { struct audit_context *context = current->audit_context; @@ -2088,7 +2044,7 @@ void audit_putname(const char *name) list_for_each_entry(n, &context->names_list, list) printk(KERN_ERR "name[%d] = %p = %s\n", i, - n->name, n->name ?: "(null)"); + n->name, n->name->name ?: "(null)"); } #endif __putname(name); @@ -2102,8 +2058,8 @@ void audit_putname(const char *name) " put_count=%d\n", __FILE__, __LINE__, context->serial, context->major, - context->in_syscall, name, context->name_count, - context->put_count); + context->in_syscall, name->name, + context->name_count, context->put_count); dump_stack(); } } @@ -2146,13 +2102,13 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent } /** - * audit_inode - store the inode and device from a lookup + * __audit_inode - store the inode and device from a lookup * @name: name being audited * @dentry: dentry being audited - * - * Called from fs/namei.c:path_lookup(). + * @parent: does this dentry represent the parent? */ -void __audit_inode(const char *name, const struct dentry *dentry) +void __audit_inode(struct filename *name, const struct dentry *dentry, + unsigned int parent) { struct audit_context *context = current->audit_context; const struct inode *inode = dentry->d_inode; @@ -2161,24 +2117,69 @@ void __audit_inode(const char *name, const struct dentry *dentry) if (!context->in_syscall) return; + if (!name) + goto out_alloc; + +#if AUDIT_DEBUG + /* The struct filename _must_ have a populated ->name */ + BUG_ON(!name->name); +#endif + /* + * If we have a pointer to an audit_names entry already, then we can + * just use it directly if the type is correct. + */ + n = name->aname; + if (n) { + if (parent) { + if (n->type == AUDIT_TYPE_PARENT || + n->type == AUDIT_TYPE_UNKNOWN) + goto out; + } else { + if (n->type != AUDIT_TYPE_PARENT) + goto out; + } + } + list_for_each_entry_reverse(n, &context->names_list, list) { - if (n->name && (n->name == name)) - goto out; + /* does the name pointer match? */ + if (!n->name || n->name->name != name->name) + continue; + + /* match the correct record type */ + if (parent) { + if (n->type == AUDIT_TYPE_PARENT || + n->type == AUDIT_TYPE_UNKNOWN) + goto out; + } else { + if (n->type != AUDIT_TYPE_PARENT) + goto out; + } } - /* unable to find the name from a previous getname() */ - n = audit_alloc_name(context); +out_alloc: + /* unable to find the name from a previous getname(). Allocate a new + * anonymous entry. + */ + n = audit_alloc_name(context, AUDIT_TYPE_NORMAL); if (!n) return; out: + if (parent) { + n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; + n->type = AUDIT_TYPE_PARENT; + } else { + n->name_len = AUDIT_NAME_FULL; + n->type = AUDIT_TYPE_NORMAL; + } handle_path(dentry); audit_copy_inode(n, dentry, inode); } /** - * audit_inode_child - collect inode info for created/removed objects - * @dentry: dentry being audited + * __audit_inode_child - collect inode info for created/removed objects * @parent: inode of dentry parent + * @dentry: dentry being audited + * @type: AUDIT_TYPE_* value that we're looking for * * For syscalls that create or remove filesystem objects, audit_inode * can only collect information for the filesystem object's parent. @@ -2188,15 +2189,14 @@ out: * must be hooked prior, in order to capture the target inode during * unsuccessful attempts. */ -void __audit_inode_child(const struct dentry *dentry, - const struct inode *parent) +void __audit_inode_child(const struct inode *parent, + const struct dentry *dentry, + const unsigned char type) { struct audit_context *context = current->audit_context; - const char *found_parent = NULL, *found_child = NULL; const struct inode *inode = dentry->d_inode; const char *dname = dentry->d_name.name; - struct audit_names *n; - int dirlen = 0; + struct audit_names *n, *found_parent = NULL, *found_child = NULL; if (!context->in_syscall) return; @@ -2204,62 +2204,65 @@ void __audit_inode_child(const struct dentry *dentry, if (inode) handle_one(inode); - /* parent is more likely, look for it first */ + /* look for a parent entry first */ list_for_each_entry(n, &context->names_list, list) { - if (!n->name) + if (!n->name || n->type != AUDIT_TYPE_PARENT) continue; if (n->ino == parent->i_ino && - !audit_compare_dname_path(dname, n->name, &dirlen)) { - n->name_len = dirlen; /* update parent data in place */ - found_parent = n->name; - goto add_names; + !audit_compare_dname_path(dname, n->name->name, n->name_len)) { + found_parent = n; + break; } } - /* no matching parent, look for matching child */ + /* is there a matching child entry? */ list_for_each_entry(n, &context->names_list, list) { - if (!n->name) + /* can only match entries that have a name */ + if (!n->name || n->type != type) continue; - /* strcmp() is the more likely scenario */ - if (!strcmp(dname, n->name) || - !audit_compare_dname_path(dname, n->name, &dirlen)) { - if (inode) - audit_copy_inode(n, NULL, inode); - else - n->ino = (unsigned long)-1; - found_child = n->name; - goto add_names; + /* if we found a parent, make sure this one is a child of it */ + if (found_parent && (n->name != found_parent->name)) + continue; + + if (!strcmp(dname, n->name->name) || + !audit_compare_dname_path(dname, n->name->name, + found_parent ? + found_parent->name_len : + AUDIT_NAME_FULL)) { + found_child = n; + break; } } -add_names: if (!found_parent) { - n = audit_alloc_name(context); + /* create a new, "anonymous" parent record */ + n = audit_alloc_name(context, AUDIT_TYPE_PARENT); if (!n) return; audit_copy_inode(n, NULL, parent); } if (!found_child) { - n = audit_alloc_name(context); - if (!n) + found_child = audit_alloc_name(context, type); + if (!found_child) return; /* Re-use the name belonging to the slot for a matching parent * directory. All names for this context are relinquished in * audit_free_names() */ if (found_parent) { - n->name = found_parent; - n->name_len = AUDIT_NAME_FULL; + found_child->name = found_parent->name; + found_child->name_len = AUDIT_NAME_FULL; /* don't call __putname() */ - n->name_put = false; + found_child->name_put = false; } - - if (inode) - audit_copy_inode(n, NULL, inode); } + if (inode) + audit_copy_inode(found_child, dentry, inode); + else + found_child->ino = (unsigned long)-1; } EXPORT_SYMBOL_GPL(__audit_inode_child); @@ -2299,14 +2302,14 @@ static atomic_t session_id = ATOMIC_INIT(0); * * Called (set) from fs/proc/base.c::proc_loginuid_write(). */ -int audit_set_loginuid(uid_t loginuid) +int audit_set_loginuid(kuid_t loginuid) { struct task_struct *task = current; struct audit_context *context = task->audit_context; unsigned int sessionid; #ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE - if (task->loginuid != -1) + if (uid_valid(task->loginuid)) return -EPERM; #else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ if (!capable(CAP_AUDIT_CONTROL)) @@ -2322,8 +2325,10 @@ int audit_set_loginuid(uid_t loginuid) audit_log_format(ab, "login pid=%d uid=%u " "old auid=%u new auid=%u" " old ses=%u new ses=%u", - task->pid, task_uid(task), - task->loginuid, loginuid, + task->pid, + from_kuid(&init_user_ns, task_uid(task)), + from_kuid(&init_user_ns, task->loginuid), + from_kuid(&init_user_ns, loginuid), task->sessionid, sessionid); audit_log_end(ab); } @@ -2546,12 +2551,12 @@ int __audit_signal_info(int sig, struct task_struct *t) struct audit_aux_data_pids *axp; struct task_struct *tsk = current; struct audit_context *ctx = tsk->audit_context; - uid_t uid = current_uid(), t_uid = task_uid(t); + kuid_t uid = current_uid(), t_uid = task_uid(t); if (audit_pid && t->tgid == audit_pid) { if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { audit_sig_pid = tsk->pid; - if (tsk->loginuid != -1) + if (uid_valid(tsk->loginuid)) audit_sig_uid = tsk->loginuid; else audit_sig_uid = uid; @@ -2672,8 +2677,8 @@ void __audit_mmap_fd(int fd, int flags) static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) { - uid_t auid, uid; - gid_t gid; + kuid_t auid, uid; + kgid_t gid; unsigned int sessionid; auid = audit_get_loginuid(current); @@ -2681,7 +2686,10 @@ static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) current_uid_gid(&uid, &gid); audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", - auid, uid, gid, sessionid); + from_kuid(&init_user_ns, auid), + from_kuid(&init_user_ns, uid), + from_kgid(&init_user_ns, gid), + sessionid); audit_log_task_context(ab); audit_log_format(ab, " pid=%d comm=", current->pid); audit_log_untrustedstring(ab, current->comm); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 79818507e444..4855892798fd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -88,11 +88,12 @@ static DEFINE_MUTEX(cgroup_root_mutex); /* * Generate an array of cgroup subsystem pointers. At boot time, this is - * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are + * populated with the built in subsystems, and modular subsystems are * registered after that. The mutable section of this array is protected by * cgroup_mutex. */ -#define SUBSYS(_x) &_x ## _subsys, +#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, +#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { #include <linux/cgroup_subsys.h> }; @@ -111,13 +112,13 @@ struct cgroupfs_root { * The bitmask of subsystems intended to be attached to this * hierarchy */ - unsigned long subsys_bits; + unsigned long subsys_mask; /* Unique id for this hierarchy. */ int hierarchy_id; /* The bitmask of subsystems currently attached to this hierarchy */ - unsigned long actual_subsys_bits; + unsigned long actual_subsys_mask; /* A list running through the attached subsystems */ struct list_head subsys_list; @@ -137,6 +138,9 @@ struct cgroupfs_root { /* Hierarchy-specific flags */ unsigned long flags; + /* IDs for cgroups in this hierarchy */ + struct ida cgroup_ida; + /* The path to use for release notifications. */ char release_agent_path[PATH_MAX]; @@ -170,8 +174,8 @@ struct css_id { * The css to which this ID points. This pointer is set to valid value * after cgroup is populated. If cgroup is removed, this will be NULL. * This pointer is expected to be RCU-safe because destroy() - * is called after synchronize_rcu(). But for safe use, css_is_removed() - * css_tryget() should be used for avoiding race. + * is called after synchronize_rcu(). But for safe use, css_tryget() + * should be used for avoiding race. */ struct cgroup_subsys_state __rcu *css; /* @@ -241,6 +245,10 @@ static DEFINE_SPINLOCK(hierarchy_id_lock); */ static int need_forkexit_callback __read_mostly; +static int cgroup_destroy_locked(struct cgroup *cgrp); +static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, + struct cftype cfts[], bool is_add); + #ifdef CONFIG_PROVE_LOCKING int cgroup_lock_is_held(void) { @@ -276,7 +284,8 @@ inline int cgroup_is_removed(const struct cgroup *cgrp) /* bits in struct cgroupfs_root flags field */ enum { - ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ + ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ + ROOT_XATTR, /* supports extended attributes */ }; static int cgroup_is_releasable(const struct cgroup *cgrp) @@ -292,11 +301,6 @@ static int notify_on_release(const struct cgroup *cgrp) return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); } -static int clone_children(const struct cgroup *cgrp) -{ - return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); -} - /* * for_each_subsys() allows you to iterate on each subsystem attached to * an active hierarchy @@ -556,7 +560,7 @@ static struct css_set *find_existing_css_set( * won't change, so no need for locking. */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - if (root->subsys_bits & (1UL << i)) { + if (root->subsys_mask & (1UL << i)) { /* Subsystem is in this hierarchy. So we want * the subsystem state from the new * cgroup */ @@ -780,12 +784,12 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * The task_lock() exception * * The need for this exception arises from the action of - * cgroup_attach_task(), which overwrites one tasks cgroup pointer with + * cgroup_attach_task(), which overwrites one task's cgroup pointer with * another. It does so using cgroup_mutex, however there are * several performance critical places that need to reference * task->cgroup without the expense of grabbing a system global * mutex. Therefore except as noted below, when dereferencing or, as - * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use + * in cgroup_attach_task(), modifying a task's cgroup pointer we use * task_lock(), which acts on a spinlock (task->alloc_lock) already in * the task_struct routinely used for such matters. * @@ -824,7 +828,8 @@ EXPORT_SYMBOL_GPL(cgroup_unlock); static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); -static int cgroup_populate_dir(struct cgroup *cgrp); +static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, + unsigned long subsys_mask); static const struct inode_operations cgroup_dir_inode_operations; static const struct file_operations proc_cgroupstats_operations; @@ -851,30 +856,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) return inode; } -/* - * Call subsys's pre_destroy handler. - * This is called before css refcnt check. - */ -static int cgroup_call_pre_destroy(struct cgroup *cgrp) -{ - struct cgroup_subsys *ss; - int ret = 0; - - for_each_subsys(cgrp->root, ss) { - if (!ss->pre_destroy) - continue; - - ret = ss->pre_destroy(cgrp); - if (ret) { - /* ->pre_destroy() failure is being deprecated */ - WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs); - break; - } - } - - return ret; -} - static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ? if so, kfree() associated cgroup */ @@ -895,7 +876,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) * Release the subsystem state objects. */ for_each_subsys(cgrp->root, ss) - ss->destroy(cgrp); + ss->css_free(cgrp); cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); @@ -912,15 +893,20 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) */ BUG_ON(!list_empty(&cgrp->pidlists)); + simple_xattrs_free(&cgrp->xattrs); + + ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); kfree_rcu(cgrp, rcu_head); } else { struct cfent *cfe = __d_cfe(dentry); struct cgroup *cgrp = dentry->d_parent->d_fsdata; + struct cftype *cft = cfe->type; WARN_ONCE(!list_empty(&cfe->node) && cgrp != &cgrp->root->top_cgroup, "cfe still linked for %s\n", cfe->type->name); kfree(cfe); + simple_xattrs_free(&cft->xattrs); } iput(inode); } @@ -963,12 +949,29 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) return -ENOENT; } -static void cgroup_clear_directory(struct dentry *dir) +/** + * cgroup_clear_directory - selective removal of base and subsystem files + * @dir: directory containing the files + * @base_files: true if the base files should be removed + * @subsys_mask: mask of the subsystem ids whose files should be removed + */ +static void cgroup_clear_directory(struct dentry *dir, bool base_files, + unsigned long subsys_mask) { struct cgroup *cgrp = __d_cgrp(dir); + struct cgroup_subsys *ss; - while (!list_empty(&cgrp->files)) - cgroup_rm_file(cgrp, NULL); + for_each_subsys(cgrp->root, ss) { + struct cftype_set *set; + if (!test_bit(ss->subsys_id, &subsys_mask)) + continue; + list_for_each_entry(set, &ss->cftsets, node) + cgroup_addrm_files(cgrp, NULL, set->cfts, false); + } + if (base_files) { + while (!list_empty(&cgrp->files)) + cgroup_rm_file(cgrp, NULL); + } } /* @@ -977,8 +980,9 @@ static void cgroup_clear_directory(struct dentry *dir) static void cgroup_d_remove_dir(struct dentry *dentry) { struct dentry *parent; + struct cgroupfs_root *root = dentry->d_sb->s_fs_info; - cgroup_clear_directory(dentry); + cgroup_clear_directory(dentry, true, root->subsys_mask); parent = dentry->d_parent; spin_lock(&parent->d_lock); @@ -990,54 +994,27 @@ static void cgroup_d_remove_dir(struct dentry *dentry) } /* - * A queue for waiters to do rmdir() cgroup. A tasks will sleep when - * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some - * reference to css->refcnt. In general, this refcnt is expected to goes down - * to zero, soon. - * - * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; - */ -static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); - -static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) -{ - if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) - wake_up_all(&cgroup_rmdir_waitq); -} - -void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) -{ - css_get(css); -} - -void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) -{ - cgroup_wakeup_rmdir_waiter(css->cgroup); - css_put(css); -} - -/* * Call with cgroup_mutex held. Drops reference counts on modules, including * any duplicate ones that parse_cgroupfs_options took. If this function * returns an error, no reference counts are touched. */ static int rebind_subsystems(struct cgroupfs_root *root, - unsigned long final_bits) + unsigned long final_subsys_mask) { - unsigned long added_bits, removed_bits; + unsigned long added_mask, removed_mask; struct cgroup *cgrp = &root->top_cgroup; int i; BUG_ON(!mutex_is_locked(&cgroup_mutex)); BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); - removed_bits = root->actual_subsys_bits & ~final_bits; - added_bits = final_bits & ~root->actual_subsys_bits; + removed_mask = root->actual_subsys_mask & ~final_subsys_mask; + added_mask = final_subsys_mask & ~root->actual_subsys_mask; /* Check that any added subsystems are currently free */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { unsigned long bit = 1UL << i; struct cgroup_subsys *ss = subsys[i]; - if (!(bit & added_bits)) + if (!(bit & added_mask)) continue; /* * Nobody should tell us to do a subsys that doesn't exist: @@ -1062,7 +1039,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; unsigned long bit = 1UL << i; - if (bit & added_bits) { + if (bit & added_mask) { /* We're binding this subsystem to this hierarchy */ BUG_ON(ss == NULL); BUG_ON(cgrp->subsys[i]); @@ -1075,7 +1052,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, if (ss->bind) ss->bind(cgrp); /* refcount was already taken, and we're keeping it */ - } else if (bit & removed_bits) { + } else if (bit & removed_mask) { /* We're removing this subsystem */ BUG_ON(ss == NULL); BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); @@ -1088,7 +1065,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, list_move(&ss->sibling, &rootnode.subsys_list); /* subsystem is now free - drop reference on module */ module_put(ss->module); - } else if (bit & final_bits) { + } else if (bit & final_subsys_mask) { /* Subsystem state should already exist */ BUG_ON(ss == NULL); BUG_ON(!cgrp->subsys[i]); @@ -1105,7 +1082,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]); } } - root->subsys_bits = root->actual_subsys_bits = final_bits; + root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; synchronize_rcu(); return 0; @@ -1121,9 +1098,11 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) seq_printf(seq, ",%s", ss->name); if (test_bit(ROOT_NOPREFIX, &root->flags)) seq_puts(seq, ",noprefix"); + if (test_bit(ROOT_XATTR, &root->flags)) + seq_puts(seq, ",xattr"); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); - if (clone_children(&root->top_cgroup)) + if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags)) seq_puts(seq, ",clone_children"); if (strlen(root->name)) seq_printf(seq, ",name=%s", root->name); @@ -1132,10 +1111,10 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) } struct cgroup_sb_opts { - unsigned long subsys_bits; + unsigned long subsys_mask; unsigned long flags; char *release_agent; - bool clone_children; + bool cpuset_clone_children; char *name; /* User explicitly requested empty subsystem */ bool none; @@ -1186,7 +1165,11 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) continue; } if (!strcmp(token, "clone_children")) { - opts->clone_children = true; + opts->cpuset_clone_children = true; + continue; + } + if (!strcmp(token, "xattr")) { + set_bit(ROOT_XATTR, &opts->flags); continue; } if (!strncmp(token, "release_agent=", 14)) { @@ -1237,7 +1220,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) /* Mutually exclusive option 'all' + subsystem name */ if (all_ss) return -EINVAL; - set_bit(i, &opts->subsys_bits); + set_bit(i, &opts->subsys_mask); one_ss = true; break; @@ -1258,7 +1241,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) continue; if (ss->disabled) continue; - set_bit(i, &opts->subsys_bits); + set_bit(i, &opts->subsys_mask); } } @@ -1270,19 +1253,19 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) * the cpuset subsystem. */ if (test_bit(ROOT_NOPREFIX, &opts->flags) && - (opts->subsys_bits & mask)) + (opts->subsys_mask & mask)) return -EINVAL; /* Can't specify "none" and some subsystems */ - if (opts->subsys_bits && opts->none) + if (opts->subsys_mask && opts->none) return -EINVAL; /* * We either have to specify by name or by subsystems. (So all * empty hierarchies must have a name). */ - if (!opts->subsys_bits && !opts->name) + if (!opts->subsys_mask && !opts->name) return -EINVAL; /* @@ -1291,10 +1274,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) * take duplicate reference counts on a subsystem that's already used, * but rebind_subsystems handles this case. */ - for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { unsigned long bit = 1UL << i; - if (!(bit & opts->subsys_bits)) + if (!(bit & opts->subsys_mask)) continue; if (!try_module_get(subsys[i]->module)) { module_pin_failed = true; @@ -1307,11 +1290,11 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) * raced with a module_delete call, and to the user this is * essentially a "subsystem doesn't exist" case. */ - for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) { + for (i--; i >= 0; i--) { /* drop refcounts only on the ones we took */ unsigned long bit = 1UL << i; - if (!(bit & opts->subsys_bits)) + if (!(bit & opts->subsys_mask)) continue; module_put(subsys[i]->module); } @@ -1321,13 +1304,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return 0; } -static void drop_parsed_module_refcounts(unsigned long subsys_bits) +static void drop_parsed_module_refcounts(unsigned long subsys_mask) { int i; - for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { unsigned long bit = 1UL << i; - if (!(bit & subsys_bits)) + if (!(bit & subsys_mask)) continue; module_put(subsys[i]->module); } @@ -1339,6 +1322,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) struct cgroupfs_root *root = sb->s_fs_info; struct cgroup *cgrp = &root->top_cgroup; struct cgroup_sb_opts opts; + unsigned long added_mask, removed_mask; mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); @@ -1349,28 +1333,38 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) if (ret) goto out_unlock; - /* See feature-removal-schedule.txt */ - if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent) + if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent) pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); + added_mask = opts.subsys_mask & ~root->subsys_mask; + removed_mask = root->subsys_mask & ~opts.subsys_mask; + /* Don't allow flags or name to change at remount */ if (opts.flags != root->flags || (opts.name && strcmp(opts.name, root->name))) { ret = -EINVAL; - drop_parsed_module_refcounts(opts.subsys_bits); + drop_parsed_module_refcounts(opts.subsys_mask); goto out_unlock; } - ret = rebind_subsystems(root, opts.subsys_bits); + /* + * Clear out the files of subsystems that should be removed, do + * this before rebind_subsystems, since rebind_subsystems may + * change this hierarchy's subsys_list. + */ + cgroup_clear_directory(cgrp->dentry, false, removed_mask); + + ret = rebind_subsystems(root, opts.subsys_mask); if (ret) { - drop_parsed_module_refcounts(opts.subsys_bits); + /* rebind_subsystems failed, re-populate the removed files */ + cgroup_populate_dir(cgrp, false, removed_mask); + drop_parsed_module_refcounts(opts.subsys_mask); goto out_unlock; } - /* clear out any existing files and repopulate subsystem files */ - cgroup_clear_directory(cgrp->dentry); - cgroup_populate_dir(cgrp); + /* re-populate subsystem files */ + cgroup_populate_dir(cgrp, false, added_mask); if (opts.release_agent) strcpy(root->release_agent_path, opts.release_agent); @@ -1396,11 +1390,13 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->files); INIT_LIST_HEAD(&cgrp->css_sets); + INIT_LIST_HEAD(&cgrp->allcg_node); INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); INIT_LIST_HEAD(&cgrp->event_list); spin_lock_init(&cgrp->event_list_lock); + simple_xattrs_init(&cgrp->xattrs); } static void init_cgroup_root(struct cgroupfs_root *root) @@ -1413,8 +1409,8 @@ static void init_cgroup_root(struct cgroupfs_root *root) root->number_of_cgroups = 1; cgrp->root = root; cgrp->top_cgroup = cgrp; - list_add_tail(&cgrp->allcg_node, &root->allcg_list); init_cgroup_housekeeping(cgrp); + list_add_tail(&cgrp->allcg_node, &root->allcg_list); } static bool init_root_id(struct cgroupfs_root *root) @@ -1455,8 +1451,8 @@ static int cgroup_test_super(struct super_block *sb, void *data) * If we asked for subsystems (or explicitly for no * subsystems) then they must match */ - if ((opts->subsys_bits || opts->none) - && (opts->subsys_bits != root->subsys_bits)) + if ((opts->subsys_mask || opts->none) + && (opts->subsys_mask != root->subsys_mask)) return 0; return 1; @@ -1466,7 +1462,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) { struct cgroupfs_root *root; - if (!opts->subsys_bits && !opts->none) + if (!opts->subsys_mask && !opts->none) return NULL; root = kzalloc(sizeof(*root), GFP_KERNEL); @@ -1479,14 +1475,15 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) } init_cgroup_root(root); - root->subsys_bits = opts->subsys_bits; + root->subsys_mask = opts->subsys_mask; root->flags = opts->flags; + ida_init(&root->cgroup_ida); if (opts->release_agent) strcpy(root->release_agent_path, opts->release_agent); if (opts->name) strcpy(root->name, opts->name); - if (opts->clone_children) - set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags); + if (opts->cpuset_clone_children) + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags); return root; } @@ -1499,6 +1496,7 @@ static void cgroup_drop_root(struct cgroupfs_root *root) spin_lock(&hierarchy_id_lock); ida_remove(&hierarchy_ida, root->hierarchy_id); spin_unlock(&hierarchy_id_lock); + ida_destroy(&root->cgroup_ida); kfree(root); } @@ -1511,7 +1509,7 @@ static int cgroup_set_super(struct super_block *sb, void *data) if (!opts->new_root) return -EINVAL; - BUG_ON(!opts->subsys_bits && !opts->none); + BUG_ON(!opts->subsys_mask && !opts->none); ret = set_anon_super(sb, NULL); if (ret) @@ -1629,7 +1627,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (ret) goto unlock_drop; - ret = rebind_subsystems(root, root->subsys_bits); + ret = rebind_subsystems(root, root->subsys_mask); if (ret == -EBUSY) { free_cg_links(&tmp_cg_links); goto unlock_drop; @@ -1664,12 +1662,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, free_cg_links(&tmp_cg_links); - BUG_ON(!list_empty(&root_cgrp->sibling)); BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(root->number_of_cgroups != 1); cred = override_creds(&init_cred); - cgroup_populate_dir(root_cgrp); + cgroup_populate_dir(root_cgrp, true, root->subsys_mask); revert_creds(cred); mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); @@ -1681,7 +1678,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, */ cgroup_drop_root(opts.new_root); /* no subsys rebinding, so refcounts don't change */ - drop_parsed_module_refcounts(opts.subsys_bits); + drop_parsed_module_refcounts(opts.subsys_mask); } kfree(opts.release_agent); @@ -1695,7 +1692,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, drop_new_super: deactivate_locked_super(sb); drop_modules: - drop_parsed_module_refcounts(opts.subsys_bits); + drop_parsed_module_refcounts(opts.subsys_mask); out_err: kfree(opts.release_agent); kfree(opts.name); @@ -1713,7 +1710,6 @@ static void cgroup_kill_sb(struct super_block *sb) { BUG_ON(root->number_of_cgroups != 1); BUG_ON(!list_empty(&cgrp->children)); - BUG_ON(!list_empty(&cgrp->sibling)); mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); @@ -1745,6 +1741,8 @@ static void cgroup_kill_sb(struct super_block *sb) { mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); + simple_xattrs_free(&cgrp->xattrs); + kill_litter_super(sb); cgroup_drop_root(root); } @@ -1769,9 +1767,11 @@ static struct kobject *cgroup_kobj; */ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) { + struct dentry *dentry = cgrp->dentry; char *start; - struct dentry *dentry = rcu_dereference_check(cgrp->dentry, - cgroup_lock_is_held()); + + rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), + "cgroup_path() called without proper locking"); if (!dentry || cgrp == dummytop) { /* @@ -1782,9 +1782,9 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) return 0; } - start = buf + buflen; + start = buf + buflen - 1; - *--start = '\0'; + *start = '\0'; for (;;) { int len = dentry->d_name.len; @@ -1795,8 +1795,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) if (!cgrp) break; - dentry = rcu_dereference_check(cgrp->dentry, - cgroup_lock_is_held()); + dentry = cgrp->dentry; if (!cgrp->parent) continue; if (--start < buf) @@ -1891,9 +1890,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size); /* * cgroup_task_migrate - move a task from one cgroup to another. * - * 'guarantee' is set if the caller promises that a new css_set for the task - * will already exist. If not set, this function might sleep, and can fail with - * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked. + * Must be called with cgroup_mutex and threadgroup locked. */ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, struct task_struct *tsk, struct css_set *newcg) @@ -1923,9 +1920,8 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, * trading it for newcg is protected by cgroup_mutex, we're safe to drop * it here; it will be freed under RCU. */ - put_css_set(oldcg); - set_bit(CGRP_RELEASABLE, &oldcgrp->flags); + put_css_set(oldcg); } /** @@ -1987,12 +1983,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) } synchronize_rcu(); - - /* - * wake up rmdir() waiter. the rmdir should fail since the cgroup - * is no longer empty. - */ - cgroup_wakeup_rmdir_waiter(cgrp); out: if (retval) { for_each_subsys(root, ss) { @@ -2162,7 +2152,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * step 5: success! and cleanup */ synchronize_rcu(); - cgroup_wakeup_rmdir_waiter(cgrp); retval = 0; out_put_css_set_refs: if (retval) { @@ -2551,6 +2540,64 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, return simple_rename(old_dir, old_dentry, new_dir, new_dentry); } +static struct simple_xattrs *__d_xattrs(struct dentry *dentry) +{ + if (S_ISDIR(dentry->d_inode->i_mode)) + return &__d_cgrp(dentry)->xattrs; + else + return &__d_cft(dentry)->xattrs; +} + +static inline int xattr_enabled(struct dentry *dentry) +{ + struct cgroupfs_root *root = dentry->d_sb->s_fs_info; + return test_bit(ROOT_XATTR, &root->flags); +} + +static bool is_valid_xattr(const char *name) +{ + if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || + !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) + return true; + return false; +} + +static int cgroup_setxattr(struct dentry *dentry, const char *name, + const void *val, size_t size, int flags) +{ + if (!xattr_enabled(dentry)) + return -EOPNOTSUPP; + if (!is_valid_xattr(name)) + return -EINVAL; + return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags); +} + +static int cgroup_removexattr(struct dentry *dentry, const char *name) +{ + if (!xattr_enabled(dentry)) + return -EOPNOTSUPP; + if (!is_valid_xattr(name)) + return -EINVAL; + return simple_xattr_remove(__d_xattrs(dentry), name); +} + +static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name, + void *buf, size_t size) +{ + if (!xattr_enabled(dentry)) + return -EOPNOTSUPP; + if (!is_valid_xattr(name)) + return -EINVAL; + return simple_xattr_get(__d_xattrs(dentry), name, buf, size); +} + +static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size) +{ + if (!xattr_enabled(dentry)) + return -EOPNOTSUPP; + return simple_xattr_list(__d_xattrs(dentry), buf, size); +} + static const struct file_operations cgroup_file_operations = { .read = cgroup_file_read, .write = cgroup_file_write, @@ -2559,11 +2606,22 @@ static const struct file_operations cgroup_file_operations = { .release = cgroup_file_release, }; +static const struct inode_operations cgroup_file_inode_operations = { + .setxattr = cgroup_setxattr, + .getxattr = cgroup_getxattr, + .listxattr = cgroup_listxattr, + .removexattr = cgroup_removexattr, +}; + static const struct inode_operations cgroup_dir_inode_operations = { .lookup = cgroup_lookup, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .rename = cgroup_rename, + .setxattr = cgroup_setxattr, + .getxattr = cgroup_getxattr, + .listxattr = cgroup_listxattr, + .removexattr = cgroup_removexattr, }; static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) @@ -2604,45 +2662,27 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode, /* start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); + inc_nlink(dentry->d_parent->d_inode); - /* start with the directory inode held, so that we can - * populate it without racing with another mkdir */ - mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + /* + * Control reaches here with cgroup_mutex held. + * @inode->i_mutex should nest outside cgroup_mutex but we + * want to populate it immediately without releasing + * cgroup_mutex. As @inode isn't visible to anyone else + * yet, trylock will always succeed without affecting + * lockdep checks. + */ + WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex)); } else if (S_ISREG(mode)) { inode->i_size = 0; inode->i_fop = &cgroup_file_operations; + inode->i_op = &cgroup_file_inode_operations; } d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ return 0; } -/* - * cgroup_create_dir - create a directory for an object. - * @cgrp: the cgroup we create the directory for. It must have a valid - * ->parent field. And we are going to fill its ->dentry field. - * @dentry: dentry of the new cgroup - * @mode: mode to set on new directory. - */ -static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, - umode_t mode) -{ - struct dentry *parent; - int error = 0; - - parent = cgrp->parent->dentry; - error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb); - if (!error) { - dentry->d_fsdata = cgrp; - inc_nlink(parent->d_inode); - rcu_assign_pointer(cgrp->dentry, dentry); - dget(dentry); - } - dput(dentry); - - return error; -} - /** * cgroup_file_mode - deduce file mode of a control file * @cft: the control file in question @@ -2671,7 +2711,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft) } static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, - const struct cftype *cft) + struct cftype *cft) { struct dentry *dir = cgrp->dentry; struct cgroup *parent = __d_cgrp(dir); @@ -2681,11 +2721,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, umode_t mode; char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; - /* does @cft->flags tell us to skip creation on @cgrp? */ - if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) - return 0; - if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) - return 0; + simple_xattrs_init(&cft->xattrs); if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { strcpy(name, subsys->name); @@ -2721,12 +2757,18 @@ out: } static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, - const struct cftype cfts[], bool is_add) + struct cftype cfts[], bool is_add) { - const struct cftype *cft; + struct cftype *cft; int err, ret = 0; for (cft = cfts; cft->name[0] != '\0'; cft++) { + /* does cft->flags tell us to skip this file on @cgrp? */ + if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) + continue; + if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) + continue; + if (is_add) err = cgroup_add_file(cgrp, subsys, cft); else @@ -2757,7 +2799,7 @@ static void cgroup_cfts_prepare(void) } static void cgroup_cfts_commit(struct cgroup_subsys *ss, - const struct cftype *cfts, bool is_add) + struct cftype *cfts, bool is_add) __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) { LIST_HEAD(pending); @@ -2808,7 +2850,7 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, * function currently returns 0 as long as @cfts registration is successful * even if some file creation attempts on existing cgroups fail. */ -int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) +int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype_set *set; @@ -2838,7 +2880,7 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes); * Returns 0 on successful unregistration, -ENOENT if @cfts is not * registered with @ss. */ -int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) +int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype_set *set; @@ -2934,6 +2976,92 @@ static void cgroup_enable_task_cg_lists(void) write_unlock(&css_set_lock); } +/** + * cgroup_next_descendant_pre - find the next descendant for pre-order walk + * @pos: the current position (%NULL to initiate traversal) + * @cgroup: cgroup whose descendants to walk + * + * To be used by cgroup_for_each_descendant_pre(). Find the next + * descendant to visit for pre-order traversal of @cgroup's descendants. + */ +struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, + struct cgroup *cgroup) +{ + struct cgroup *next; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* if first iteration, pretend we just visited @cgroup */ + if (!pos) { + if (list_empty(&cgroup->children)) + return NULL; + pos = cgroup; + } + + /* visit the first child if exists */ + next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); + if (next) + return next; + + /* no child, visit my or the closest ancestor's next sibling */ + do { + next = list_entry_rcu(pos->sibling.next, struct cgroup, + sibling); + if (&next->sibling != &pos->parent->children) + return next; + + pos = pos->parent; + } while (pos != cgroup); + + return NULL; +} +EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); + +static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) +{ + struct cgroup *last; + + do { + last = pos; + pos = list_first_or_null_rcu(&pos->children, struct cgroup, + sibling); + } while (pos); + + return last; +} + +/** + * cgroup_next_descendant_post - find the next descendant for post-order walk + * @pos: the current position (%NULL to initiate traversal) + * @cgroup: cgroup whose descendants to walk + * + * To be used by cgroup_for_each_descendant_post(). Find the next + * descendant to visit for post-order traversal of @cgroup's descendants. + */ +struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, + struct cgroup *cgroup) +{ + struct cgroup *next; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* if first iteration, visit the leftmost descendant */ + if (!pos) { + next = cgroup_leftmost_descendant(cgroup); + return next != cgroup ? next : NULL; + } + + /* if there's an unvisited sibling, visit its leftmost descendant */ + next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); + if (&next->sibling != &pos->parent->children) + return cgroup_leftmost_descendant(next); + + /* no sibling left, visit parent */ + next = pos->parent; + return next != cgroup ? next : NULL; +} +EXPORT_SYMBOL_GPL(cgroup_next_descendant_post); + void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) __acquires(css_set_lock) { @@ -3280,7 +3408,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, { struct cgroup_pidlist *l; /* don't need task_nsproxy() if we're looking at ourself */ - struct pid_namespace *ns = current->nsproxy->pid_ns; + struct pid_namespace *ns = task_active_pid_ns(current); /* * We can't drop the pidlist_mutex before taking the l->mutex in case @@ -3647,7 +3775,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, if (flags & POLLHUP) { __remove_wait_queue(event->wqh, &event->wait); spin_lock(&cgrp->event_list_lock); - list_del(&event->list); + list_del_init(&event->list); spin_unlock(&cgrp->event_list_lock); /* * We are in atomic context, but cgroup_event_remove() may @@ -3784,7 +3912,7 @@ fail: static u64 cgroup_clone_children_read(struct cgroup *cgrp, struct cftype *cft) { - return clone_children(cgrp); + return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); } static int cgroup_clone_children_write(struct cgroup *cgrp, @@ -3792,9 +3920,9 @@ static int cgroup_clone_children_write(struct cgroup *cgrp, u64 val) { if (val) - set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); else - clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); + clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); return 0; } @@ -3843,18 +3971,29 @@ static struct cftype files[] = { { } /* terminate */ }; -static int cgroup_populate_dir(struct cgroup *cgrp) +/** + * cgroup_populate_dir - selectively creation of files in a directory + * @cgrp: target cgroup + * @base_files: true if the base files should be added + * @subsys_mask: mask of the subsystem ids whose files should be added + */ +static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, + unsigned long subsys_mask) { int err; struct cgroup_subsys *ss; - err = cgroup_addrm_files(cgrp, NULL, files, true); - if (err < 0) - return err; + if (base_files) { + err = cgroup_addrm_files(cgrp, NULL, files, true); + if (err < 0) + return err; + } /* process cftsets of each subsystem */ for_each_subsys(cgrp->root, ss) { struct cftype_set *set; + if (!test_bit(ss->subsys_id, &subsys_mask)) + continue; list_for_each_entry(set, &ss->cftsets, node) cgroup_addrm_files(cgrp, ss, set->cfts, true); @@ -3896,19 +4035,57 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, css->flags = 0; css->id = NULL; if (cgrp == dummytop) - set_bit(CSS_ROOT, &css->flags); + css->flags |= CSS_ROOT; BUG_ON(cgrp->subsys[ss->subsys_id]); cgrp->subsys[ss->subsys_id] = css; /* - * If !clear_css_refs, css holds an extra ref to @cgrp->dentry - * which is put on the last css_put(). dput() requires process - * context, which css_put() may be called without. @css->dput_work - * will be used to invoke dput() asynchronously from css_put(). + * css holds an extra ref to @cgrp->dentry which is put on the last + * css_put(). dput() requires process context, which css_put() may + * be called without. @css->dput_work will be used to invoke + * dput() asynchronously from css_put(). */ INIT_WORK(&css->dput_work, css_dput_fn); - if (ss->__DEPRECATED_clear_css_refs) - set_bit(CSS_CLEAR_CSS_REFS, &css->flags); +} + +/* invoke ->post_create() on a new CSS and mark it online if successful */ +static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + int ret = 0; + + lockdep_assert_held(&cgroup_mutex); + + if (ss->css_online) + ret = ss->css_online(cgrp); + if (!ret) + cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; + return ret; +} + +/* if the CSS is online, invoke ->pre_destory() on it and mark it offline */ +static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) + __releases(&cgroup_mutex) __acquires(&cgroup_mutex) +{ + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + + lockdep_assert_held(&cgroup_mutex); + + if (!(css->flags & CSS_ONLINE)) + return; + + /* + * css_offline() should be called with cgroup_mutex unlocked. See + * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for + * details. This temporary unlocking should go away once + * cgroup_mutex is unexported from controllers. + */ + if (ss->css_offline) { + mutex_unlock(&cgroup_mutex); + ss->css_offline(cgrp); + mutex_lock(&cgroup_mutex); + } + + cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; } /* @@ -3928,10 +4105,27 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, struct cgroup_subsys *ss; struct super_block *sb = root->sb; + /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); if (!cgrp) return -ENOMEM; + cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); + if (cgrp->id < 0) + goto err_free_cgrp; + + /* + * Only live parents can have children. Note that the liveliness + * check isn't strictly necessary because cgroup_mkdir() and + * cgroup_rmdir() are fully synchronized by i_mutex; however, do it + * anyway so that locking is contained inside cgroup proper and we + * don't get nasty surprises if we ever grow another caller. + */ + if (!cgroup_lock_live_group(parent)) { + err = -ENODEV; + goto err_free_id; + } + /* Grab a reference on the superblock so the hierarchy doesn't * get deleted on unmount if there are child cgroups. This * can be done outside cgroup_mutex, since the sb can't @@ -3939,8 +4133,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, * fs */ atomic_inc(&sb->s_active); - mutex_lock(&cgroup_mutex); - init_cgroup_housekeeping(cgrp); cgrp->parent = parent; @@ -3950,71 +4142,90 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); - if (clone_children(parent)) - set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); + if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); for_each_subsys(root, ss) { - struct cgroup_subsys_state *css = ss->create(cgrp); + struct cgroup_subsys_state *css; + css = ss->css_alloc(cgrp); if (IS_ERR(css)) { err = PTR_ERR(css); - goto err_destroy; + goto err_free_all; } init_cgroup_css(css, ss, cgrp); if (ss->use_id) { err = alloc_css_id(ss, parent, cgrp); if (err) - goto err_destroy; + goto err_free_all; } - /* At error, ->destroy() callback has to free assigned ID. */ - if (clone_children(parent) && ss->post_clone) - ss->post_clone(cgrp); } - list_add(&cgrp->sibling, &cgrp->parent->children); - root->number_of_cgroups++; - - err = cgroup_create_dir(cgrp, dentry, mode); + /* + * Create directory. cgroup_create_file() returns with the new + * directory locked on success so that it can be populated without + * dropping cgroup_mutex. + */ + err = cgroup_create_file(dentry, S_IFDIR | mode, sb); if (err < 0) - goto err_remove; + goto err_free_all; + lockdep_assert_held(&dentry->d_inode->i_mutex); + + /* allocation complete, commit to creation */ + dentry->d_fsdata = cgrp; + cgrp->dentry = dentry; + list_add_tail(&cgrp->allcg_node, &root->allcg_list); + list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); + root->number_of_cgroups++; - /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */ + /* each css holds a ref to the cgroup's dentry */ for_each_subsys(root, ss) - if (!ss->__DEPRECATED_clear_css_refs) - dget(dentry); + dget(dentry); - /* The cgroup directory was pre-locked for us */ - BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); + /* creation succeeded, notify subsystems */ + for_each_subsys(root, ss) { + err = online_css(ss, cgrp); + if (err) + goto err_destroy; - list_add_tail(&cgrp->allcg_node, &root->allcg_list); + if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && + parent->parent) { + pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", + current->comm, current->pid, ss->name); + if (!strcmp(ss->name, "memory")) + pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); + ss->warned_broken_hierarchy = true; + } + } - err = cgroup_populate_dir(cgrp); - /* If err < 0, we have a half-filled directory - oh well ;) */ + err = cgroup_populate_dir(cgrp, true, root->subsys_mask); + if (err) + goto err_destroy; mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); return 0; - err_remove: - - list_del(&cgrp->sibling); - root->number_of_cgroups--; - - err_destroy: - +err_free_all: for_each_subsys(root, ss) { if (cgrp->subsys[ss->subsys_id]) - ss->destroy(cgrp); + ss->css_free(cgrp); } - mutex_unlock(&cgroup_mutex); - /* Release the reference count that we took on the superblock */ deactivate_super(sb); - +err_free_id: + ida_simple_remove(&root->cgroup_ida, cgrp->id); +err_free_cgrp: kfree(cgrp); return err; + +err_destroy: + cgroup_destroy_locked(cgrp); + mutex_unlock(&cgroup_mutex); + mutex_unlock(&dentry->d_inode->i_mutex); + return err; } static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) @@ -4066,153 +4277,60 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) return 0; } -/* - * Atomically mark all (or else none) of the cgroup's CSS objects as - * CSS_REMOVED. Return true on success, or false if the cgroup has - * busy subsystems. Call with cgroup_mutex held - * - * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or - * not, cgroup removal behaves differently. - * - * If clear is set, css refcnt for the subsystem should be zero before - * cgroup removal can be committed. This is implemented by - * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be - * called multiple times until all css refcnts reach zero and is allowed to - * veto removal on any invocation. This behavior is deprecated and will be - * removed as soon as the existing user (memcg) is updated. - * - * If clear is not set, each css holds an extra reference to the cgroup's - * dentry and cgroup removal proceeds regardless of css refs. - * ->pre_destroy() will be called at least once and is not allowed to fail. - * On the last put of each css, whenever that may be, the extra dentry ref - * is put so that dentry destruction happens only after all css's are - * released. - */ -static int cgroup_clear_css_refs(struct cgroup *cgrp) +static int cgroup_destroy_locked(struct cgroup *cgrp) + __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { + struct dentry *d = cgrp->dentry; + struct cgroup *parent = cgrp->parent; + DEFINE_WAIT(wait); + struct cgroup_event *event, *tmp; struct cgroup_subsys *ss; - unsigned long flags; - bool failed = false; + LIST_HEAD(tmp_list); + + lockdep_assert_held(&d->d_inode->i_mutex); + lockdep_assert_held(&cgroup_mutex); - local_irq_save(flags); + if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) + return -EBUSY; /* - * Block new css_tryget() by deactivating refcnt. If all refcnts - * for subsystems w/ clear_css_refs set were 1 at the moment of - * deactivation, we succeeded. + * Block new css_tryget() by deactivating refcnt and mark @cgrp + * removed. This makes future css_tryget() and child creation + * attempts fail thus maintaining the removal conditions verified + * above. */ for_each_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; WARN_ON(atomic_read(&css->refcnt) < 0); atomic_add(CSS_DEACT_BIAS, &css->refcnt); - - if (ss->__DEPRECATED_clear_css_refs) - failed |= css_refcnt(css) != 1; } + set_bit(CGRP_REMOVED, &cgrp->flags); - /* - * If succeeded, set REMOVED and put all the base refs; otherwise, - * restore refcnts to positive values. Either way, all in-progress - * css_tryget() will be released. - */ - for_each_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; - - if (!failed) { - set_bit(CSS_REMOVED, &css->flags); - css_put(css); - } else { - atomic_sub(CSS_DEACT_BIAS, &css->refcnt); - } - } - - local_irq_restore(flags); - return !failed; -} - -static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) -{ - struct cgroup *cgrp = dentry->d_fsdata; - struct dentry *d; - struct cgroup *parent; - DEFINE_WAIT(wait); - struct cgroup_event *event, *tmp; - int ret; - - /* the vfs holds both inode->i_mutex already */ -again: - mutex_lock(&cgroup_mutex); - if (atomic_read(&cgrp->count) != 0) { - mutex_unlock(&cgroup_mutex); - return -EBUSY; - } - if (!list_empty(&cgrp->children)) { - mutex_unlock(&cgroup_mutex); - return -EBUSY; - } - mutex_unlock(&cgroup_mutex); + /* tell subsystems to initate destruction */ + for_each_subsys(cgrp->root, ss) + offline_css(ss, cgrp); /* - * In general, subsystem has no css->refcnt after pre_destroy(). But - * in racy cases, subsystem may have to get css->refcnt after - * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes - * make rmdir return -EBUSY too often. To avoid that, we use waitqueue - * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir - * and subsystem's reference count handling. Please see css_get/put - * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. + * Put all the base refs. Each css holds an extra reference to the + * cgroup's dentry and cgroup removal proceeds regardless of css + * refs. On the last put of each css, whenever that may be, the + * extra dentry ref is put so that dentry destruction happens only + * after all css's are released. */ - set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - - /* - * Call pre_destroy handlers of subsys. Notify subsystems - * that rmdir() request comes. - */ - ret = cgroup_call_pre_destroy(cgrp); - if (ret) { - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - return ret; - } - - mutex_lock(&cgroup_mutex); - parent = cgrp->parent; - if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - mutex_unlock(&cgroup_mutex); - return -EBUSY; - } - prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); - if (!cgroup_clear_css_refs(cgrp)) { - mutex_unlock(&cgroup_mutex); - /* - * Because someone may call cgroup_wakeup_rmdir_waiter() before - * prepare_to_wait(), we need to check this flag. - */ - if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) - schedule(); - finish_wait(&cgroup_rmdir_waitq, &wait); - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - if (signal_pending(current)) - return -EINTR; - goto again; - } - /* NO css_tryget() can success after here. */ - finish_wait(&cgroup_rmdir_waitq, &wait); - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + for_each_subsys(cgrp->root, ss) + css_put(cgrp->subsys[ss->subsys_id]); raw_spin_lock(&release_list_lock); - set_bit(CGRP_REMOVED, &cgrp->flags); if (!list_empty(&cgrp->release_list)) list_del_init(&cgrp->release_list); raw_spin_unlock(&release_list_lock); /* delete this cgroup from parent->children */ - list_del_init(&cgrp->sibling); - + list_del_rcu(&cgrp->sibling); list_del_init(&cgrp->allcg_node); - d = dget(cgrp->dentry); - + dget(d); cgroup_d_remove_dir(d); dput(d); @@ -4222,21 +4340,35 @@ again: /* * Unregister events and notify userspace. * Notify userspace about cgroup removing only after rmdir of cgroup - * directory to avoid race between userspace and kernelspace + * directory to avoid race between userspace and kernelspace. Use + * a temporary list to avoid a deadlock with cgroup_event_wake(). Since + * cgroup_event_wake() is called with the wait queue head locked, + * remove_wait_queue() cannot be called while holding event_list_lock. */ spin_lock(&cgrp->event_list_lock); - list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { - list_del(&event->list); + list_splice_init(&cgrp->event_list, &tmp_list); + spin_unlock(&cgrp->event_list_lock); + list_for_each_entry_safe(event, tmp, &tmp_list, list) { + list_del_init(&event->list); remove_wait_queue(event->wqh, &event->wait); eventfd_signal(event->eventfd, 1); schedule_work(&event->remove); } - spin_unlock(&cgrp->event_list_lock); - mutex_unlock(&cgroup_mutex); return 0; } +static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) +{ + int ret; + + mutex_lock(&cgroup_mutex); + ret = cgroup_destroy_locked(dentry->d_fsdata); + mutex_unlock(&cgroup_mutex); + + return ret; +} + static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) { INIT_LIST_HEAD(&ss->cftsets); @@ -4257,13 +4389,15 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); + mutex_lock(&cgroup_mutex); + /* init base cftset */ cgroup_init_cftsets(ss); /* Create the top cgroup state for this subsystem */ list_add(&ss->sibling, &rootnode.subsys_list); ss->root = &rootnode; - css = ss->create(dummytop); + css = ss->css_alloc(dummytop); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_cgroup_css(css, ss, dummytop); @@ -4272,7 +4406,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * pointer to this state - since the subsystem is * newly registered, all tasks and hence the * init_css_set is in the subsystem's top cgroup. */ - init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; + init_css_set.subsys[ss->subsys_id] = css; need_forkexit_callback |= ss->fork || ss->exit; @@ -4282,6 +4416,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(!list_empty(&init_task.tasks)); ss->active = 1; + BUG_ON(online_css(ss, dummytop)); + + mutex_unlock(&cgroup_mutex); /* this function shouldn't be used with modular subsystems, since they * need to register a subsys_id, among other things */ @@ -4299,12 +4436,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) */ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) { - int i; struct cgroup_subsys_state *css; + int i, ret; /* check name and function validity */ if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || - ss->create == NULL || ss->destroy == NULL) + ss->css_alloc == NULL || ss->css_free == NULL) return -EINVAL; /* @@ -4321,8 +4458,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * since cgroup_init_subsys will have already taken care of it. */ if (ss->module == NULL) { - /* a few sanity checks */ - BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT); + /* a sanity check */ BUG_ON(subsys[ss->subsys_id] != ss); return 0; } @@ -4330,33 +4466,18 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) /* init base cftset */ cgroup_init_cftsets(ss); - /* - * need to register a subsys id before anything else - for example, - * init_cgroup_css needs it. - */ mutex_lock(&cgroup_mutex); - /* find the first empty slot in the array */ - for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { - if (subsys[i] == NULL) - break; - } - if (i == CGROUP_SUBSYS_COUNT) { - /* maximum number of subsystems already registered! */ - mutex_unlock(&cgroup_mutex); - return -EBUSY; - } - /* assign ourselves the subsys_id */ - ss->subsys_id = i; - subsys[i] = ss; + subsys[ss->subsys_id] = ss; /* - * no ss->create seems to need anything important in the ss struct, so - * this can happen first (i.e. before the rootnode attachment). + * no ss->css_alloc seems to need anything important in the ss + * struct, so this can happen first (i.e. before the rootnode + * attachment). */ - css = ss->create(dummytop); + css = ss->css_alloc(dummytop); if (IS_ERR(css)) { /* failure case - need to deassign the subsys[] slot. */ - subsys[i] = NULL; + subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); return PTR_ERR(css); } @@ -4368,14 +4489,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) init_cgroup_css(css, ss, dummytop); /* init_idr must be after init_cgroup_css because it sets css->id. */ if (ss->use_id) { - int ret = cgroup_init_idr(ss, css); - if (ret) { - dummytop->subsys[ss->subsys_id] = NULL; - ss->destroy(dummytop); - subsys[i] = NULL; - mutex_unlock(&cgroup_mutex); - return ret; - } + ret = cgroup_init_idr(ss, css); + if (ret) + goto err_unload; } /* @@ -4408,10 +4524,19 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) write_unlock(&css_set_lock); ss->active = 1; + ret = online_css(ss, dummytop); + if (ret) + goto err_unload; /* success! */ mutex_unlock(&cgroup_mutex); return 0; + +err_unload: + mutex_unlock(&cgroup_mutex); + /* @ss can't be mounted here as try_module_get() would fail */ + cgroup_unload_subsys(ss); + return ret; } EXPORT_SYMBOL_GPL(cgroup_load_subsys); @@ -4438,8 +4563,16 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) BUG_ON(ss->root != &rootnode); mutex_lock(&cgroup_mutex); + + offline_css(ss, dummytop); + ss->active = 0; + + if (ss->use_id) { + idr_remove_all(&ss->idr); + idr_destroy(&ss->idr); + } + /* deassign the subsys_id */ - BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT); subsys[ss->subsys_id] = NULL; /* remove subsystem from rootnode's list of subsystems */ @@ -4454,7 +4587,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) struct css_set *cg = link->cg; hlist_del(&cg->hlist); - BUG_ON(!cg->subsys[ss->subsys_id]); cg->subsys[ss->subsys_id] = NULL; hhead = css_set_hash(cg->subsys); hlist_add_head(&cg->hlist, hhead); @@ -4462,12 +4594,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) write_unlock(&css_set_lock); /* - * remove subsystem's css from the dummytop and free it - need to free - * before marking as null because ss->destroy needs the cgrp->subsys - * pointer to find their state. note that this also takes care of - * freeing the css_id. + * remove subsystem's css from the dummytop and free it - need to + * free before marking as null because ss->css_free needs the + * cgrp->subsys pointer to find their state. note that this also + * takes care of freeing the css_id. */ - ss->destroy(dummytop); + ss->css_free(dummytop); dummytop->subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); @@ -4502,14 +4634,17 @@ int __init cgroup_init_early(void) for (i = 0; i < CSS_SET_TABLE_SIZE; i++) INIT_HLIST_HEAD(&css_set_table[i]); - /* at bootup time, we don't worry about modular subsystems */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + /* at bootup time, we don't worry about modular subsystems */ + if (!ss || ss->module) + continue; + BUG_ON(!ss->name); BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); - BUG_ON(!ss->create); - BUG_ON(!ss->destroy); + BUG_ON(!ss->css_alloc); + BUG_ON(!ss->css_free); if (ss->subsys_id != i) { printk(KERN_ERR "cgroup: Subsys %s id == %d\n", ss->name, ss->subsys_id); @@ -4538,9 +4673,12 @@ int __init cgroup_init(void) if (err) return err; - /* at bootup time, we don't worry about modular subsystems */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + + /* at bootup time, we don't worry about modular subsystems */ + if (!ss || ss->module) + continue; if (!ss->early_init) cgroup_init_subsys(ss); if (ss->use_id) @@ -4695,70 +4833,37 @@ static const struct file_operations proc_cgroupstats_operations = { * * A pointer to the shared css_set was automatically copied in * fork.c by dup_task_struct(). However, we ignore that copy, since - * it was not made under the protection of RCU, cgroup_mutex or - * threadgroup_change_begin(), so it might no longer be a valid - * cgroup pointer. cgroup_attach_task() might have already changed - * current->cgroups, allowing the previously referenced cgroup - * group to be removed and freed. - * - * Outside the pointer validity we also need to process the css_set - * inheritance between threadgoup_change_begin() and - * threadgoup_change_end(), this way there is no leak in any process - * wide migration performed by cgroup_attach_proc() that could otherwise - * miss a thread because it is too early or too late in the fork stage. + * it was not made under the protection of RCU or cgroup_mutex, so + * might no longer be a valid cgroup pointer. cgroup_attach_task() might + * have already changed current->cgroups, allowing the previously + * referenced cgroup group to be removed and freed. * * At the point that cgroup_fork() is called, 'current' is the parent * task, and the passed argument 'child' points to the child task. */ void cgroup_fork(struct task_struct *child) { - /* - * We don't need to task_lock() current because current->cgroups - * can't be changed concurrently here. The parent obviously hasn't - * exited and called cgroup_exit(), and we are synchronized against - * cgroup migration through threadgroup_change_begin(). - */ + task_lock(current); child->cgroups = current->cgroups; get_css_set(child->cgroups); + task_unlock(current); INIT_LIST_HEAD(&child->cg_list); } /** - * cgroup_fork_callbacks - run fork callbacks - * @child: the new task - * - * Called on a new task very soon before adding it to the - * tasklist. No need to take any locks since no-one can - * be operating on this task. - */ -void cgroup_fork_callbacks(struct task_struct *child) -{ - if (need_forkexit_callback) { - int i; - /* - * forkexit callbacks are only supported for builtin - * subsystems, and the builtin section of the subsys array is - * immutable, so we don't need to lock the subsys array here. - */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss->fork) - ss->fork(child); - } - } -} - -/** * cgroup_post_fork - called on a new task after adding it to the task list * @child: the task in question * - * Adds the task to the list running through its css_set if necessary. - * Has to be after the task is visible on the task list in case we race - * with the first call to cgroup_iter_start() - to guarantee that the - * new task ends up on its list. + * Adds the task to the list running through its css_set if necessary and + * call the subsystem fork() callbacks. Has to be after the task is + * visible on the task list in case we race with the first call to + * cgroup_iter_start() - to guarantee that the new task ends up on its + * list. */ void cgroup_post_fork(struct task_struct *child) { + int i; + /* * use_task_css_set_links is set to 1 before we walk the tasklist * under the tasklist_lock and we read it here after we added the child @@ -4772,22 +4877,36 @@ void cgroup_post_fork(struct task_struct *child) */ if (use_task_css_set_links) { write_lock(&css_set_lock); - if (list_empty(&child->cg_list)) { + task_lock(child); + if (list_empty(&child->cg_list)) + list_add(&child->cg_list, &child->cgroups->tasks); + task_unlock(child); + write_unlock(&css_set_lock); + } + + /* + * Call ss->fork(). This must happen after @child is linked on + * css_set; otherwise, @child might change state between ->fork() + * and addition to css_set. + */ + if (need_forkexit_callback) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + /* - * It's safe to use child->cgroups without task_lock() - * here because we are protected through - * threadgroup_change_begin() against concurrent - * css_set change in cgroup_task_migrate(). Also - * the task can't exit at that point until - * wake_up_new_task() is called, so we are protected - * against cgroup_exit() setting child->cgroup to - * init_css_set. + * fork/exit callbacks are supported only for + * builtin subsystems and we don't need further + * synchronization as they never go away. */ - list_add(&child->cg_list, &child->cgroups->tasks); + if (!ss || ss->module) + continue; + + if (ss->fork) + ss->fork(child); } - write_unlock(&css_set_lock); } } + /** * cgroup_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process @@ -4846,12 +4965,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) tsk->cgroups = &init_css_set; if (run_callbacks && need_forkexit_callback) { - /* - * modular subsystems can't use callbacks, so no need to lock - * the subsys array - */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + + /* modular subsystems can't use callbacks */ + if (!ss || ss->module) + continue; + if (ss->exit) { struct cgroup *old_cgrp = rcu_dereference_raw(cg->subsys[i])->cgroup; @@ -4919,15 +5039,17 @@ static void check_for_release(struct cgroup *cgrp) /* Caller must verify that the css is not for root cgroup */ bool __css_tryget(struct cgroup_subsys_state *css) { - do { - int v = css_refcnt(css); + while (true) { + int t, v; - if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) + v = css_refcnt(css); + t = atomic_cmpxchg(&css->refcnt, v, v + 1); + if (likely(t == v)) return true; + else if (t < 0) + return false; cpu_relax(); - } while (!test_bit(CSS_REMOVED, &css->flags)); - - return false; + } } EXPORT_SYMBOL_GPL(__css_tryget); @@ -4946,11 +5068,9 @@ void __css_put(struct cgroup_subsys_state *css) set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } - cgroup_wakeup_rmdir_waiter(cgrp); break; case 0: - if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) - schedule_work(&css->dput_work); + schedule_work(&css->dput_work); break; } rcu_read_unlock(); @@ -5037,13 +5157,17 @@ static int __init cgroup_disable(char *str) while ((token = strsep(&str, ",")) != NULL) { if (!*token) continue; - /* - * cgroup_disable, being at boot time, can't know about module - * subsystems, so we don't worry about them. - */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + /* + * cgroup_disable, being at boot time, can't + * know about module subsystems, so we don't + * worry about them. + */ + if (!ss || ss->module) + continue; + if (!strcmp(token, ss->name)) { ss->disabled = 1; printk(KERN_INFO "Disabling %s control group" @@ -5332,7 +5456,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) } #ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state *debug_create(struct cgroup *cont) +static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) { struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); @@ -5342,7 +5466,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup *cont) return css; } -static void debug_destroy(struct cgroup *cont) +static void debug_css_free(struct cgroup *cont) { kfree(cont->subsys[debug_subsys_id]); } @@ -5471,8 +5595,8 @@ static struct cftype debug_files[] = { struct cgroup_subsys debug_subsys = { .name = "debug", - .create = debug_create, - .destroy = debug_destroy, + .css_alloc = debug_css_alloc, + .css_free = debug_css_free, .subsys_id = debug_subsys_id, .base_cftypes = debug_files, }; diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 3649fc6b3eaa..75dda1ea5026 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -22,24 +22,33 @@ #include <linux/freezer.h> #include <linux/seq_file.h> -enum freezer_state { - CGROUP_THAWED = 0, - CGROUP_FREEZING, - CGROUP_FROZEN, +/* + * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is + * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared + * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING + * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of + * its ancestors has FREEZING_SELF set. + */ +enum freezer_state_flags { + CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */ + CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */ + CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */ + CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */ + + /* mask for all FREEZING flags */ + CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT, }; struct freezer { - struct cgroup_subsys_state css; - enum freezer_state state; - spinlock_t lock; /* protects _writes_ to state */ + struct cgroup_subsys_state css; + unsigned int state; + spinlock_t lock; }; -static inline struct freezer *cgroup_freezer( - struct cgroup *cgroup) +static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) { - return container_of( - cgroup_subsys_state(cgroup, freezer_subsys_id), - struct freezer, css); + return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id), + struct freezer, css); } static inline struct freezer *task_freezer(struct task_struct *task) @@ -48,14 +57,21 @@ static inline struct freezer *task_freezer(struct task_struct *task) struct freezer, css); } +static struct freezer *parent_freezer(struct freezer *freezer) +{ + struct cgroup *pcg = freezer->css.cgroup->parent; + + if (pcg) + return cgroup_freezer(pcg); + return NULL; +} + bool cgroup_freezing(struct task_struct *task) { - enum freezer_state state; bool ret; rcu_read_lock(); - state = task_freezer(task)->state; - ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN; + ret = task_freezer(task)->state & CGROUP_FREEZING; rcu_read_unlock(); return ret; @@ -65,70 +81,18 @@ bool cgroup_freezing(struct task_struct *task) * cgroups_write_string() limits the size of freezer state strings to * CGROUP_LOCAL_BUFFER_SIZE */ -static const char *freezer_state_strs[] = { - "THAWED", - "FREEZING", - "FROZEN", +static const char *freezer_state_strs(unsigned int state) +{ + if (state & CGROUP_FROZEN) + return "FROZEN"; + if (state & CGROUP_FREEZING) + return "FREEZING"; + return "THAWED"; }; -/* - * State diagram - * Transitions are caused by userspace writes to the freezer.state file. - * The values in parenthesis are state labels. The rest are edge labels. - * - * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN) - * ^ ^ | | - * | \_______THAWED_______/ | - * \__________________________THAWED____________/ - */ - struct cgroup_subsys freezer_subsys; -/* Locks taken and their ordering - * ------------------------------ - * cgroup_mutex (AKA cgroup_lock) - * freezer->lock - * css_set_lock - * task->alloc_lock (AKA task_lock) - * task->sighand->siglock - * - * cgroup code forces css_set_lock to be taken before task->alloc_lock - * - * freezer_create(), freezer_destroy(): - * cgroup_mutex [ by cgroup core ] - * - * freezer_can_attach(): - * cgroup_mutex (held by caller of can_attach) - * - * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): - * freezer->lock - * sighand->siglock (if the cgroup is freezing) - * - * freezer_read(): - * cgroup_mutex - * freezer->lock - * write_lock css_set_lock (cgroup iterator start) - * task->alloc_lock - * read_lock css_set_lock (cgroup iterator start) - * - * freezer_write() (freeze): - * cgroup_mutex - * freezer->lock - * write_lock css_set_lock (cgroup iterator start) - * task->alloc_lock - * read_lock css_set_lock (cgroup iterator start) - * sighand->siglock (fake signal delivery inside freeze_task()) - * - * freezer_write() (unfreeze): - * cgroup_mutex - * freezer->lock - * write_lock css_set_lock (cgroup iterator start) - * task->alloc_lock - * read_lock css_set_lock (cgroup iterator start) - * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator()) - * sighand->siglock - */ -static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) +static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) { struct freezer *freezer; @@ -137,160 +101,244 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) return ERR_PTR(-ENOMEM); spin_lock_init(&freezer->lock); - freezer->state = CGROUP_THAWED; return &freezer->css; } -static void freezer_destroy(struct cgroup *cgroup) +/** + * freezer_css_online - commit creation of a freezer cgroup + * @cgroup: cgroup being created + * + * We're committing to creation of @cgroup. Mark it online and inherit + * parent's freezing state while holding both parent's and our + * freezer->lock. + */ +static int freezer_css_online(struct cgroup *cgroup) +{ + struct freezer *freezer = cgroup_freezer(cgroup); + struct freezer *parent = parent_freezer(freezer); + + /* + * The following double locking and freezing state inheritance + * guarantee that @cgroup can never escape ancestors' freezing + * states. See cgroup_for_each_descendant_pre() for details. + */ + if (parent) + spin_lock_irq(&parent->lock); + spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING); + + freezer->state |= CGROUP_FREEZER_ONLINE; + + if (parent && (parent->state & CGROUP_FREEZING)) { + freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; + atomic_inc(&system_freezing_cnt); + } + + spin_unlock(&freezer->lock); + if (parent) + spin_unlock_irq(&parent->lock); + + return 0; +} + +/** + * freezer_css_offline - initiate destruction of @cgroup + * @cgroup: cgroup being destroyed + * + * @cgroup is going away. Mark it dead and decrement system_freezing_count + * if it was holding one. + */ +static void freezer_css_offline(struct cgroup *cgroup) { struct freezer *freezer = cgroup_freezer(cgroup); - if (freezer->state != CGROUP_THAWED) + spin_lock_irq(&freezer->lock); + + if (freezer->state & CGROUP_FREEZING) atomic_dec(&system_freezing_cnt); - kfree(freezer); + + freezer->state = 0; + + spin_unlock_irq(&freezer->lock); } -/* task is frozen or will freeze immediately when next it gets woken */ -static bool is_task_frozen_enough(struct task_struct *task) +static void freezer_css_free(struct cgroup *cgroup) { - return frozen(task) || - (task_is_stopped_or_traced(task) && freezing(task)); + kfree(cgroup_freezer(cgroup)); } /* - * The call to cgroup_lock() in the freezer.state write method prevents - * a write to that file racing against an attach, and hence the - * can_attach() result will remain valid until the attach completes. + * Tasks can be migrated into a different freezer anytime regardless of its + * current state. freezer_attach() is responsible for making new tasks + * conform to the current state. + * + * Freezer state changes and task migration are synchronized via + * @freezer->lock. freezer_attach() makes the new tasks conform to the + * current state and all following state changes can see the new tasks. */ -static int freezer_can_attach(struct cgroup *new_cgroup, - struct cgroup_taskset *tset) +static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) { - struct freezer *freezer; + struct freezer *freezer = cgroup_freezer(new_cgrp); struct task_struct *task; + bool clear_frozen = false; + + spin_lock_irq(&freezer->lock); /* - * Anything frozen can't move or be moved to/from. + * Make the new tasks conform to the current state of @new_cgrp. + * For simplicity, when migrating any task to a FROZEN cgroup, we + * revert it to FREEZING and let update_if_frozen() determine the + * correct state later. + * + * Tasks in @tset are on @new_cgrp but may not conform to its + * current state before executing the following - !frozen tasks may + * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. */ - cgroup_taskset_for_each(task, new_cgroup, tset) - if (cgroup_freezing(task)) - return -EBUSY; + cgroup_taskset_for_each(task, new_cgrp, tset) { + if (!(freezer->state & CGROUP_FREEZING)) { + __thaw_task(task); + } else { + freeze_task(task); + freezer->state &= ~CGROUP_FROZEN; + clear_frozen = true; + } + } - freezer = cgroup_freezer(new_cgroup); - if (freezer->state != CGROUP_THAWED) - return -EBUSY; + spin_unlock_irq(&freezer->lock); - return 0; + /* + * Propagate FROZEN clearing upwards. We may race with + * update_if_frozen(), but as long as both work bottom-up, either + * update_if_frozen() sees child's FROZEN cleared or we clear the + * parent's FROZEN later. No parent w/ !FROZEN children can be + * left FROZEN. + */ + while (clear_frozen && (freezer = parent_freezer(freezer))) { + spin_lock_irq(&freezer->lock); + freezer->state &= ~CGROUP_FROZEN; + clear_frozen = freezer->state & CGROUP_FREEZING; + spin_unlock_irq(&freezer->lock); + } } static void freezer_fork(struct task_struct *task) { struct freezer *freezer; - /* - * No lock is needed, since the task isn't on tasklist yet, - * so it can't be moved to another cgroup, which means the - * freezer won't be removed and will be valid during this - * function call. Nevertheless, apply RCU read-side critical - * section to suppress RCU lockdep false positives. - */ rcu_read_lock(); freezer = task_freezer(task); - rcu_read_unlock(); /* * The root cgroup is non-freezable, so we can skip the * following check. */ if (!freezer->css.cgroup->parent) - return; + goto out; spin_lock_irq(&freezer->lock); - BUG_ON(freezer->state == CGROUP_FROZEN); - - /* Locking avoids race with FREEZING -> THAWED transitions. */ - if (freezer->state == CGROUP_FREEZING) + if (freezer->state & CGROUP_FREEZING) freeze_task(task); spin_unlock_irq(&freezer->lock); +out: + rcu_read_unlock(); } -/* - * caller must hold freezer->lock +/** + * update_if_frozen - update whether a cgroup finished freezing + * @cgroup: cgroup of interest + * + * Once FREEZING is initiated, transition to FROZEN is lazily updated by + * calling this function. If the current state is FREEZING but not FROZEN, + * this function checks whether all tasks of this cgroup and the descendant + * cgroups finished freezing and, if so, sets FROZEN. + * + * The caller is responsible for grabbing RCU read lock and calling + * update_if_frozen() on all descendants prior to invoking this function. + * + * Task states and freezer state might disagree while tasks are being + * migrated into or out of @cgroup, so we can't verify task states against + * @freezer state here. See freezer_attach() for details. */ -static void update_if_frozen(struct cgroup *cgroup, - struct freezer *freezer) +static void update_if_frozen(struct cgroup *cgroup) { + struct freezer *freezer = cgroup_freezer(cgroup); + struct cgroup *pos; struct cgroup_iter it; struct task_struct *task; - unsigned int nfrozen = 0, ntotal = 0; - enum freezer_state old_state = freezer->state; - cgroup_iter_start(cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) { - ntotal++; - if (freezing(task) && is_task_frozen_enough(task)) - nfrozen++; + WARN_ON_ONCE(!rcu_read_lock_held()); + + spin_lock_irq(&freezer->lock); + + if (!(freezer->state & CGROUP_FREEZING) || + (freezer->state & CGROUP_FROZEN)) + goto out_unlock; + + /* are all (live) children frozen? */ + cgroup_for_each_child(pos, cgroup) { + struct freezer *child = cgroup_freezer(pos); + + if ((child->state & CGROUP_FREEZER_ONLINE) && + !(child->state & CGROUP_FROZEN)) + goto out_unlock; } - if (old_state == CGROUP_THAWED) { - BUG_ON(nfrozen > 0); - } else if (old_state == CGROUP_FREEZING) { - if (nfrozen == ntotal) - freezer->state = CGROUP_FROZEN; - } else { /* old_state == CGROUP_FROZEN */ - BUG_ON(nfrozen != ntotal); + /* are all tasks frozen? */ + cgroup_iter_start(cgroup, &it); + + while ((task = cgroup_iter_next(cgroup, &it))) { + if (freezing(task)) { + /* + * freezer_should_skip() indicates that the task + * should be skipped when determining freezing + * completion. Consider it frozen in addition to + * the usual frozen condition. + */ + if (!frozen(task) && !freezer_should_skip(task)) + goto out_iter_end; + } } + freezer->state |= CGROUP_FROZEN; +out_iter_end: cgroup_iter_end(cgroup, &it); +out_unlock: + spin_unlock_irq(&freezer->lock); } static int freezer_read(struct cgroup *cgroup, struct cftype *cft, struct seq_file *m) { - struct freezer *freezer; - enum freezer_state state; + struct cgroup *pos; - if (!cgroup_lock_live_group(cgroup)) - return -ENODEV; + rcu_read_lock(); - freezer = cgroup_freezer(cgroup); - spin_lock_irq(&freezer->lock); - state = freezer->state; - if (state == CGROUP_FREEZING) { - /* We change from FREEZING to FROZEN lazily if the cgroup was - * only partially frozen when we exitted write. */ - update_if_frozen(cgroup, freezer); - state = freezer->state; - } - spin_unlock_irq(&freezer->lock); - cgroup_unlock(); + /* update states bottom-up */ + cgroup_for_each_descendant_post(pos, cgroup) + update_if_frozen(pos); + update_if_frozen(cgroup); - seq_puts(m, freezer_state_strs[state]); + rcu_read_unlock(); + + seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state)); seq_putc(m, '\n'); return 0; } -static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) +static void freeze_cgroup(struct freezer *freezer) { + struct cgroup *cgroup = freezer->css.cgroup; struct cgroup_iter it; struct task_struct *task; - unsigned int num_cant_freeze_now = 0; cgroup_iter_start(cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) { - if (!freeze_task(task)) - continue; - if (is_task_frozen_enough(task)) - continue; - if (!freezing(task) && !freezer_should_skip(task)) - num_cant_freeze_now++; - } + while ((task = cgroup_iter_next(cgroup, &it))) + freeze_task(task); cgroup_iter_end(cgroup, &it); - - return num_cant_freeze_now ? -EBUSY : 0; } -static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) +static void unfreeze_cgroup(struct freezer *freezer) { + struct cgroup *cgroup = freezer->css.cgroup; struct cgroup_iter it; struct task_struct *task; @@ -300,59 +348,111 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) cgroup_iter_end(cgroup, &it); } -static int freezer_change_state(struct cgroup *cgroup, - enum freezer_state goal_state) +/** + * freezer_apply_state - apply state change to a single cgroup_freezer + * @freezer: freezer to apply state change to + * @freeze: whether to freeze or unfreeze + * @state: CGROUP_FREEZING_* flag to set or clear + * + * Set or clear @state on @cgroup according to @freeze, and perform + * freezing or thawing as necessary. + */ +static void freezer_apply_state(struct freezer *freezer, bool freeze, + unsigned int state) { - struct freezer *freezer; - int retval = 0; - - freezer = cgroup_freezer(cgroup); + /* also synchronizes against task migration, see freezer_attach() */ + lockdep_assert_held(&freezer->lock); - spin_lock_irq(&freezer->lock); + if (!(freezer->state & CGROUP_FREEZER_ONLINE)) + return; - update_if_frozen(cgroup, freezer); - - switch (goal_state) { - case CGROUP_THAWED: - if (freezer->state != CGROUP_THAWED) - atomic_dec(&system_freezing_cnt); - freezer->state = CGROUP_THAWED; - unfreeze_cgroup(cgroup, freezer); - break; - case CGROUP_FROZEN: - if (freezer->state == CGROUP_THAWED) + if (freeze) { + if (!(freezer->state & CGROUP_FREEZING)) atomic_inc(&system_freezing_cnt); - freezer->state = CGROUP_FREEZING; - retval = try_to_freeze_cgroup(cgroup, freezer); - break; - default: - BUG(); + freezer->state |= state; + freeze_cgroup(freezer); + } else { + bool was_freezing = freezer->state & CGROUP_FREEZING; + + freezer->state &= ~state; + + if (!(freezer->state & CGROUP_FREEZING)) { + if (was_freezing) + atomic_dec(&system_freezing_cnt); + freezer->state &= ~CGROUP_FROZEN; + unfreeze_cgroup(freezer); + } } +} + +/** + * freezer_change_state - change the freezing state of a cgroup_freezer + * @freezer: freezer of interest + * @freeze: whether to freeze or thaw + * + * Freeze or thaw @freezer according to @freeze. The operations are + * recursive - all descendants of @freezer will be affected. + */ +static void freezer_change_state(struct freezer *freezer, bool freeze) +{ + struct cgroup *pos; + /* update @freezer */ + spin_lock_irq(&freezer->lock); + freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF); spin_unlock_irq(&freezer->lock); - return retval; + /* + * Update all its descendants in pre-order traversal. Each + * descendant will try to inherit its parent's FREEZING state as + * CGROUP_FREEZING_PARENT. + */ + rcu_read_lock(); + cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) { + struct freezer *pos_f = cgroup_freezer(pos); + struct freezer *parent = parent_freezer(pos_f); + + /* + * Our update to @parent->state is already visible which is + * all we need. No need to lock @parent. For more info on + * synchronization, see freezer_post_create(). + */ + spin_lock_irq(&pos_f->lock); + freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, + CGROUP_FREEZING_PARENT); + spin_unlock_irq(&pos_f->lock); + } + rcu_read_unlock(); } -static int freezer_write(struct cgroup *cgroup, - struct cftype *cft, +static int freezer_write(struct cgroup *cgroup, struct cftype *cft, const char *buffer) { - int retval; - enum freezer_state goal_state; + bool freeze; - if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0) - goal_state = CGROUP_THAWED; - else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) - goal_state = CGROUP_FROZEN; + if (strcmp(buffer, freezer_state_strs(0)) == 0) + freeze = false; + else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0) + freeze = true; else return -EINVAL; - if (!cgroup_lock_live_group(cgroup)) - return -ENODEV; - retval = freezer_change_state(cgroup, goal_state); - cgroup_unlock(); - return retval; + freezer_change_state(cgroup_freezer(cgroup), freeze); + return 0; +} + +static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft) +{ + struct freezer *freezer = cgroup_freezer(cgroup); + + return (bool)(freezer->state & CGROUP_FREEZING_SELF); +} + +static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft) +{ + struct freezer *freezer = cgroup_freezer(cgroup); + + return (bool)(freezer->state & CGROUP_FREEZING_PARENT); } static struct cftype files[] = { @@ -362,15 +462,27 @@ static struct cftype files[] = { .read_seq_string = freezer_read, .write_string = freezer_write, }, + { + .name = "self_freezing", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = freezer_self_freezing_read, + }, + { + .name = "parent_freezing", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = freezer_parent_freezing_read, + }, { } /* terminate */ }; struct cgroup_subsys freezer_subsys = { .name = "freezer", - .create = freezer_create, - .destroy = freezer_destroy, + .css_alloc = freezer_css_alloc, + .css_online = freezer_css_online, + .css_offline = freezer_css_offline, + .css_free = freezer_css_free, .subsys_id = freezer_subsys_id, - .can_attach = freezer_can_attach, + .attach = freezer_attach, .fork = freezer_fork, .base_cftypes = files, }; diff --git a/kernel/compat.c b/kernel/compat.c index c28a306ae05c..f6150e92dfc9 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -1215,6 +1215,23 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info) return 0; } +#ifdef __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL +asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, + struct compat_timespec __user *interval) +{ + struct timespec t; + int ret; + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t); + set_fs(old_fs); + if (put_compat_timespec(&t, interval)) + return -EFAULT; + return ret; +} +#endif /* __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL */ + /* * Allocate user-space memory for the duration of a single system call, * in order to marshall parameters inside a compat thunk. diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c new file mode 100644 index 000000000000..e0e07fd55508 --- /dev/null +++ b/kernel/context_tracking.c @@ -0,0 +1,83 @@ +#include <linux/context_tracking.h> +#include <linux/rcupdate.h> +#include <linux/sched.h> +#include <linux/percpu.h> +#include <linux/hardirq.h> + +struct context_tracking { + /* + * When active is false, hooks are not set to + * minimize overhead: TIF flags are cleared + * and calls to user_enter/exit are ignored. This + * may be further optimized using static keys. + */ + bool active; + enum { + IN_KERNEL = 0, + IN_USER, + } state; +}; + +static DEFINE_PER_CPU(struct context_tracking, context_tracking) = { +#ifdef CONFIG_CONTEXT_TRACKING_FORCE + .active = true, +#endif +}; + +void user_enter(void) +{ + unsigned long flags; + + /* + * Some contexts may involve an exception occuring in an irq, + * leading to that nesting: + * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() + * This would mess up the dyntick_nesting count though. And rcu_irq_*() + * helpers are enough to protect RCU uses inside the exception. So + * just return immediately if we detect we are in an IRQ. + */ + if (in_interrupt()) + return; + + WARN_ON_ONCE(!current->mm); + + local_irq_save(flags); + if (__this_cpu_read(context_tracking.active) && + __this_cpu_read(context_tracking.state) != IN_USER) { + __this_cpu_write(context_tracking.state, IN_USER); + rcu_user_enter(); + } + local_irq_restore(flags); +} + +void user_exit(void) +{ + unsigned long flags; + + /* + * Some contexts may involve an exception occuring in an irq, + * leading to that nesting: + * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() + * This would mess up the dyntick_nesting count though. And rcu_irq_*() + * helpers are enough to protect RCU uses inside the exception. So + * just return immediately if we detect we are in an IRQ. + */ + if (in_interrupt()) + return; + + local_irq_save(flags); + if (__this_cpu_read(context_tracking.state) == IN_USER) { + __this_cpu_write(context_tracking.state, IN_KERNEL); + rcu_user_exit(); + } + local_irq_restore(flags); +} + +void context_tracking_task_switch(struct task_struct *prev, + struct task_struct *next) +{ + if (__this_cpu_read(context_tracking.active)) { + clear_tsk_thread_flag(prev, TIF_NOHZ); + set_tsk_thread_flag(next, TIF_NOHZ); + } +} diff --git a/kernel/cpu.c b/kernel/cpu.c index 14d32588cccd..3046a503242c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -80,6 +80,10 @@ void put_online_cpus(void) if (cpu_hotplug.active_writer == current) return; mutex_lock(&cpu_hotplug.lock); + + if (WARN_ON(!cpu_hotplug.refcount)) + cpu_hotplug.refcount++; /* try to fix things up */ + if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) wake_up_process(cpu_hotplug.active_writer); mutex_unlock(&cpu_hotplug.lock); @@ -280,12 +284,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) __func__, cpu); goto out_release; } + smpboot_park_threads(cpu); err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ + smpboot_unpark_threads(cpu); cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); - goto out_release; } BUG_ON(cpu_online(cpu)); @@ -343,17 +348,23 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; struct task_struct *idle; - if (cpu_online(cpu) || !cpu_present(cpu)) - return -EINVAL; - cpu_hotplug_begin(); + if (cpu_online(cpu) || !cpu_present(cpu)) { + ret = -EINVAL; + goto out; + } + idle = idle_thread_get(cpu); if (IS_ERR(idle)) { ret = PTR_ERR(idle); goto out; } + ret = smpboot_create_threads(cpu); + if (ret) + goto out; + ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); if (ret) { nr_calls--; @@ -368,6 +379,9 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) goto out_notify; BUG_ON(!cpu_online(cpu)); + /* Wake the per cpu threads */ + smpboot_unpark_threads(cpu); + /* Now call notifier in preparation. */ cpu_notify(CPU_ONLINE | mod, hcpu); @@ -439,14 +453,6 @@ EXPORT_SYMBOL_GPL(cpu_up); #ifdef CONFIG_PM_SLEEP_SMP static cpumask_var_t frozen_cpus; -void __weak arch_disable_nonboot_cpus_begin(void) -{ -} - -void __weak arch_disable_nonboot_cpus_end(void) -{ -} - int disable_nonboot_cpus(void) { int cpu, first_cpu, error = 0; @@ -458,7 +464,6 @@ int disable_nonboot_cpus(void) * with the userspace trying to use the CPU hotplug at the same time */ cpumask_clear(frozen_cpus); - arch_disable_nonboot_cpus_begin(); printk("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { @@ -474,8 +479,6 @@ int disable_nonboot_cpus(void) } } - arch_disable_nonboot_cpus_end(); - if (!error) { BUG_ON(num_online_cpus() > 1); /* Make sure the CPUs won't be enabled by someone else */ @@ -600,6 +603,11 @@ cpu_hotplug_pm_callback(struct notifier_block *nb, static int __init cpu_hotplug_pm_sync_init(void) { + /* + * cpu_hotplug_pm_callback has higher priority than x86 + * bsp_pm_callback which depends on cpu_hotplug_pm_callback + * to disable cpu hotplug to avoid cpu hotplug race. + */ pm_notifier(cpu_hotplug_pm_callback, 0); return 0; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f33c7153b6d7..7bb63eea6eb8 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -302,10 +302,10 @@ static void guarantee_online_cpus(const struct cpuset *cs, * are online, with memory. If none are online with memory, walk * up the cpuset hierarchy until we find one that does have some * online mems. If we get all the way to the top and still haven't - * found any online mems, return node_states[N_HIGH_MEMORY]. + * found any online mems, return node_states[N_MEMORY]. * * One way or another, we guarantee to return some non-empty subset - * of node_states[N_HIGH_MEMORY]. + * of node_states[N_MEMORY]. * * Call with callback_mutex held. */ @@ -313,14 +313,14 @@ static void guarantee_online_cpus(const struct cpuset *cs, static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) { while (cs && !nodes_intersects(cs->mems_allowed, - node_states[N_HIGH_MEMORY])) + node_states[N_MEMORY])) cs = cs->parent; if (cs) nodes_and(*pmask, cs->mems_allowed, - node_states[N_HIGH_MEMORY]); + node_states[N_MEMORY]); else - *pmask = node_states[N_HIGH_MEMORY]; - BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY])); + *pmask = node_states[N_MEMORY]; + BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY])); } /* @@ -1100,7 +1100,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, return -ENOMEM; /* - * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; + * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; * it's read-only */ if (cs == &top_cpuset) { @@ -1122,7 +1122,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, goto done; if (!nodes_subset(trialcs->mems_allowed, - node_states[N_HIGH_MEMORY])) { + node_states[N_MEMORY])) { retval = -EINVAL; goto done; } @@ -1784,56 +1784,20 @@ static struct cftype files[] = { }; /* - * post_clone() is called during cgroup_create() when the - * clone_children mount argument was specified. The cgroup - * can not yet have any tasks. - * - * Currently we refuse to set up the cgroup - thereby - * refusing the task to be entered, and as a result refusing - * the sys_unshare() or clone() which initiated it - if any - * sibling cpusets have exclusive cpus or mem. - * - * If this becomes a problem for some users who wish to - * allow that scenario, then cpuset_post_clone() could be - * changed to grant parent->cpus_allowed-sibling_cpus_exclusive - * (and likewise for mems) to the new cgroup. Called with cgroup_mutex - * held. - */ -static void cpuset_post_clone(struct cgroup *cgroup) -{ - struct cgroup *parent, *child; - struct cpuset *cs, *parent_cs; - - parent = cgroup->parent; - list_for_each_entry(child, &parent->children, sibling) { - cs = cgroup_cs(child); - if (is_mem_exclusive(cs) || is_cpu_exclusive(cs)) - return; - } - cs = cgroup_cs(cgroup); - parent_cs = cgroup_cs(parent); - - mutex_lock(&callback_mutex); - cs->mems_allowed = parent_cs->mems_allowed; - cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); - mutex_unlock(&callback_mutex); - return; -} - -/* - * cpuset_create - create a cpuset + * cpuset_css_alloc - allocate a cpuset css * cont: control group that the new cpuset will be part of */ -static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) +static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) { - struct cpuset *cs; - struct cpuset *parent; + struct cgroup *parent_cg = cont->parent; + struct cgroup *tmp_cg; + struct cpuset *parent, *cs; - if (!cont->parent) { + if (!parent_cg) return &top_cpuset.css; - } - parent = cgroup_cs(cont->parent); + parent = cgroup_cs(parent_cg); + cs = kmalloc(sizeof(*cs), GFP_KERNEL); if (!cs) return ERR_PTR(-ENOMEM); @@ -1855,7 +1819,36 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) cs->parent = parent; number_of_cpusets++; - return &cs->css ; + + if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags)) + goto skip_clone; + + /* + * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is + * set. This flag handling is implemented in cgroup core for + * histrical reasons - the flag may be specified during mount. + * + * Currently, if any sibling cpusets have exclusive cpus or mem, we + * refuse to clone the configuration - thereby refusing the task to + * be entered, and as a result refusing the sys_unshare() or + * clone() which initiated it. If this becomes a problem for some + * users who wish to allow that scenario, then this could be + * changed to grant parent->cpus_allowed-sibling_cpus_exclusive + * (and likewise for mems) to the new cgroup. + */ + list_for_each_entry(tmp_cg, &parent_cg->children, sibling) { + struct cpuset *tmp_cs = cgroup_cs(tmp_cg); + + if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) + goto skip_clone; + } + + mutex_lock(&callback_mutex); + cs->mems_allowed = parent->mems_allowed; + cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); + mutex_unlock(&callback_mutex); +skip_clone: + return &cs->css; } /* @@ -1864,7 +1857,7 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) * will call async_rebuild_sched_domains(). */ -static void cpuset_destroy(struct cgroup *cont) +static void cpuset_css_free(struct cgroup *cont) { struct cpuset *cs = cgroup_cs(cont); @@ -1878,11 +1871,10 @@ static void cpuset_destroy(struct cgroup *cont) struct cgroup_subsys cpuset_subsys = { .name = "cpuset", - .create = cpuset_create, - .destroy = cpuset_destroy, + .css_alloc = cpuset_css_alloc, + .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, .attach = cpuset_attach, - .post_clone = cpuset_post_clone, .subsys_id = cpuset_subsys_id, .base_cftypes = files, .early_init = 1, @@ -2034,7 +2026,7 @@ static struct cpuset *cpuset_next(struct list_head *queue) * before dropping down to the next. It always processes a node before * any of its children. * - * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY + * In the case of memory hot-unplug, it will remove nodes from N_MEMORY * if all present pages from a node are offlined. */ static void @@ -2073,7 +2065,7 @@ scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) /* Continue past cpusets with all mems online */ if (nodes_subset(cp->mems_allowed, - node_states[N_HIGH_MEMORY])) + node_states[N_MEMORY])) continue; oldmems = cp->mems_allowed; @@ -2081,7 +2073,7 @@ scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) /* Remove offline mems from this cpuset. */ mutex_lock(&callback_mutex); nodes_and(cp->mems_allowed, cp->mems_allowed, - node_states[N_HIGH_MEMORY]); + node_states[N_MEMORY]); mutex_unlock(&callback_mutex); /* Move tasks from the empty cpuset to a parent */ @@ -2134,8 +2126,8 @@ void cpuset_update_active_cpus(bool cpu_online) #ifdef CONFIG_MEMORY_HOTPLUG /* - * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. - * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. + * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. + * Call this routine anytime after node_states[N_MEMORY] changes. * See cpuset_update_active_cpus() for CPU hotplug handling. */ static int cpuset_track_online_nodes(struct notifier_block *self, @@ -2148,7 +2140,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self, case MEM_ONLINE: oldmems = top_cpuset.mems_allowed; mutex_lock(&callback_mutex); - top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; + top_cpuset.mems_allowed = node_states[N_MEMORY]; mutex_unlock(&callback_mutex); update_tasks_nodemask(&top_cpuset, &oldmems, NULL); break; @@ -2177,7 +2169,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self, void __init cpuset_init_smp(void) { cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); - top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; + top_cpuset.mems_allowed = node_states[N_MEMORY]; hotplug_memory_notifier(cpuset_track_online_nodes, 10); @@ -2245,7 +2237,7 @@ void cpuset_init_current_mems_allowed(void) * * Description: Returns the nodemask_t mems_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty - * subset of node_states[N_HIGH_MEMORY], even if this means going outside the + * subset of node_states[N_MEMORY], even if this means going outside the * tasks cpuset. **/ diff --git a/kernel/cred.c b/kernel/cred.c index de728ac50d82..e0573a43c7df 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -30,17 +30,6 @@ static struct kmem_cache *cred_jar; /* - * The common credentials for the initial task's thread group - */ -#ifdef CONFIG_KEYS -static struct thread_group_cred init_tgcred = { - .usage = ATOMIC_INIT(2), - .tgid = 0, - .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock), -}; -#endif - -/* * The initial credentials for the initial task */ struct cred init_cred = { @@ -65,9 +54,6 @@ struct cred init_cred = { .user = INIT_USER, .user_ns = &init_user_ns, .group_info = &init_groups, -#ifdef CONFIG_KEYS - .tgcred = &init_tgcred, -#endif }; static inline void set_cred_subscribers(struct cred *cred, int n) @@ -96,36 +82,6 @@ static inline void alter_cred_subscribers(const struct cred *_cred, int n) } /* - * Dispose of the shared task group credentials - */ -#ifdef CONFIG_KEYS -static void release_tgcred_rcu(struct rcu_head *rcu) -{ - struct thread_group_cred *tgcred = - container_of(rcu, struct thread_group_cred, rcu); - - BUG_ON(atomic_read(&tgcred->usage) != 0); - - key_put(tgcred->session_keyring); - key_put(tgcred->process_keyring); - kfree(tgcred); -} -#endif - -/* - * Release a set of thread group credentials. - */ -static void release_tgcred(struct cred *cred) -{ -#ifdef CONFIG_KEYS - struct thread_group_cred *tgcred = cred->tgcred; - - if (atomic_dec_and_test(&tgcred->usage)) - call_rcu(&tgcred->rcu, release_tgcred_rcu); -#endif -} - -/* * The RCU callback to actually dispose of a set of credentials */ static void put_cred_rcu(struct rcu_head *rcu) @@ -150,9 +106,10 @@ static void put_cred_rcu(struct rcu_head *rcu) #endif security_cred_free(cred); + key_put(cred->session_keyring); + key_put(cred->process_keyring); key_put(cred->thread_keyring); key_put(cred->request_key_auth); - release_tgcred(cred); if (cred->group_info) put_group_info(cred->group_info); free_uid(cred->user); @@ -246,15 +203,6 @@ struct cred *cred_alloc_blank(void) if (!new) return NULL; -#ifdef CONFIG_KEYS - new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); - if (!new->tgcred) { - kmem_cache_free(cred_jar, new); - return NULL; - } - atomic_set(&new->tgcred->usage, 1); -#endif - atomic_set(&new->usage, 1); #ifdef CONFIG_DEBUG_CREDENTIALS new->magic = CRED_MAGIC; @@ -308,9 +256,10 @@ struct cred *prepare_creds(void) get_user_ns(new->user_ns); #ifdef CONFIG_KEYS + key_get(new->session_keyring); + key_get(new->process_keyring); key_get(new->thread_keyring); key_get(new->request_key_auth); - atomic_inc(&new->tgcred->usage); #endif #ifdef CONFIG_SECURITY @@ -334,39 +283,20 @@ EXPORT_SYMBOL(prepare_creds); */ struct cred *prepare_exec_creds(void) { - struct thread_group_cred *tgcred = NULL; struct cred *new; -#ifdef CONFIG_KEYS - tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); - if (!tgcred) - return NULL; -#endif - new = prepare_creds(); - if (!new) { - kfree(tgcred); + if (!new) return new; - } #ifdef CONFIG_KEYS /* newly exec'd tasks don't get a thread keyring */ key_put(new->thread_keyring); new->thread_keyring = NULL; - /* create a new per-thread-group creds for all this set of threads to - * share */ - memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred)); - - atomic_set(&tgcred->usage, 1); - spin_lock_init(&tgcred->lock); - /* inherit the session keyring; new process keyring */ - key_get(tgcred->session_keyring); - tgcred->process_keyring = NULL; - - release_tgcred(new); - new->tgcred = tgcred; + key_put(new->process_keyring); + new->process_keyring = NULL; #endif return new; @@ -383,9 +313,6 @@ struct cred *prepare_exec_creds(void) */ int copy_creds(struct task_struct *p, unsigned long clone_flags) { -#ifdef CONFIG_KEYS - struct thread_group_cred *tgcred; -#endif struct cred *new; int ret; @@ -425,22 +352,12 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) install_thread_keyring_to_cred(new); } - /* we share the process and session keyrings between all the threads in - * a process - this is slightly icky as we violate COW credentials a - * bit */ + /* The process keyring is only shared between the threads in a process; + * anything outside of those threads doesn't inherit. + */ if (!(clone_flags & CLONE_THREAD)) { - tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); - if (!tgcred) { - ret = -ENOMEM; - goto error_put; - } - atomic_set(&tgcred->usage, 1); - spin_lock_init(&tgcred->lock); - tgcred->process_keyring = NULL; - tgcred->session_keyring = key_get(new->tgcred->session_keyring); - - release_tgcred(new); - new->tgcred = tgcred; + key_put(new->process_keyring); + new->process_keyring = NULL; } #endif @@ -455,6 +372,31 @@ error_put: return ret; } +static bool cred_cap_issubset(const struct cred *set, const struct cred *subset) +{ + const struct user_namespace *set_ns = set->user_ns; + const struct user_namespace *subset_ns = subset->user_ns; + + /* If the two credentials are in the same user namespace see if + * the capabilities of subset are a subset of set. + */ + if (set_ns == subset_ns) + return cap_issubset(subset->cap_permitted, set->cap_permitted); + + /* The credentials are in a different user namespaces + * therefore one is a subset of the other only if a set is an + * ancestor of subset and set->euid is owner of subset or one + * of subsets ancestors. + */ + for (;subset_ns != &init_user_ns; subset_ns = subset_ns->parent) { + if ((set_ns == subset_ns->parent) && + uid_eq(subset_ns->owner, set->euid)) + return true; + } + + return false; +} + /** * commit_creds - Install new credentials upon the current task * @new: The credentials to be assigned @@ -493,7 +435,7 @@ int commit_creds(struct cred *new) !gid_eq(old->egid, new->egid) || !uid_eq(old->fsuid, new->fsuid) || !gid_eq(old->fsgid, new->fsgid) || - !cap_issubset(new->cap_permitted, old->cap_permitted)) { + !cred_cap_issubset(old, new)) { if (task->mm) set_dumpable(task->mm, suid_dumpable); task->pdeath_signal = 0; @@ -643,9 +585,6 @@ void __init cred_init(void) */ struct cred *prepare_kernel_cred(struct task_struct *daemon) { -#ifdef CONFIG_KEYS - struct thread_group_cred *tgcred; -#endif const struct cred *old; struct cred *new; @@ -653,14 +592,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) if (!new) return NULL; -#ifdef CONFIG_KEYS - tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); - if (!tgcred) { - kmem_cache_free(cred_jar, new); - return NULL; - } -#endif - kdebug("prepare_kernel_cred() alloc %p", new); if (daemon) @@ -678,13 +609,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) get_group_info(new->group_info); #ifdef CONFIG_KEYS - atomic_set(&tgcred->usage, 1); - spin_lock_init(&tgcred->lock); - tgcred->process_keyring = NULL; - tgcred->session_keyring = NULL; - new->tgcred = tgcred; - new->request_key_auth = NULL; + new->session_keyring = NULL; + new->process_keyring = NULL; new->thread_keyring = NULL; + new->request_key_auth = NULL; new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; #endif @@ -799,9 +727,15 @@ static void dump_invalid_creds(const struct cred *cred, const char *label, atomic_read(&cred->usage), read_cred_subscribers(cred)); printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n", - cred->uid, cred->euid, cred->suid, cred->fsuid); + from_kuid_munged(&init_user_ns, cred->uid), + from_kuid_munged(&init_user_ns, cred->euid), + from_kuid_munged(&init_user_ns, cred->suid), + from_kuid_munged(&init_user_ns, cred->fsuid)); printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n", - cred->gid, cred->egid, cred->sgid, cred->fsgid); + from_kgid_munged(&init_user_ns, cred->gid), + from_kgid_munged(&init_user_ns, cred->egid), + from_kgid_munged(&init_user_ns, cred->sgid), + from_kgid_munged(&init_user_ns, cred->fsgid)); #ifdef CONFIG_SECURITY printk(KERN_ERR "CRED: ->security is %p\n", cred->security); if ((unsigned long) cred->security >= PAGE_SIZE && diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0557f24c6bca..9a61738cefc8 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -672,6 +672,10 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) { struct kgdb_state kgdb_var; struct kgdb_state *ks = &kgdb_var; + int ret = 0; + + if (arch_kgdb_ops.enable_nmi) + arch_kgdb_ops.enable_nmi(0); ks->cpu = raw_smp_processor_id(); ks->ex_vector = evector; @@ -681,13 +685,33 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) ks->linux_regs = regs; if (kgdb_reenter_check(ks)) - return 0; /* Ouch, double exception ! */ + goto out; /* Ouch, double exception ! */ if (kgdb_info[ks->cpu].enter_kgdb != 0) - return 0; + goto out; - return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); + ret = kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); +out: + if (arch_kgdb_ops.enable_nmi) + arch_kgdb_ops.enable_nmi(1); + return ret; } +/* + * GDB places a breakpoint at this function to know dynamically + * loaded objects. It's not defined static so that only one instance with this + * name exists in the kernel. + */ + +static int module_event(struct notifier_block *self, unsigned long val, + void *data) +{ + return 0; +} + +static struct notifier_block dbg_module_load_nb = { + .notifier_call = module_event, +}; + int kgdb_nmicallback(int cpu, void *regs) { #ifdef CONFIG_SMP @@ -816,6 +840,7 @@ static void kgdb_register_callbacks(void) kgdb_arch_init(); if (!dbg_is_early) kgdb_arch_late(); + register_module_notifier(&dbg_module_load_nb); register_reboot_notifier(&dbg_reboot_notifier); atomic_notifier_chain_register(&panic_notifier_list, &kgdb_panic_event_nb); @@ -839,6 +864,7 @@ static void kgdb_unregister_callbacks(void) if (kgdb_io_module_registered) { kgdb_io_module_registered = 0; unregister_reboot_notifier(&dbg_reboot_notifier); + unregister_module_notifier(&dbg_module_load_nb); atomic_notifier_chain_unregister(&panic_notifier_list, &kgdb_panic_event_nb); kgdb_arch_exit(); diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 07c9bbb94a0b..b03e0e814e43 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -129,6 +129,8 @@ kdb_bt(int argc, const char **argv) } /* Now the inactive tasks */ kdb_do_each_thread(g, p) { + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; if (task_curr(p)) continue; if (kdb_bt1(p, mask, argcount, btaprompt)) diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 0a69d2adc4f3..14ff4849262c 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -552,6 +552,7 @@ int vkdb_printf(const char *fmt, va_list ap) { int diag; int linecount; + int colcount; int logging, saved_loglevel = 0; int saved_trap_printk; int got_printf_lock = 0; @@ -584,6 +585,10 @@ int vkdb_printf(const char *fmt, va_list ap) if (diag || linecount <= 1) linecount = 24; + diag = kdbgetintenv("COLUMNS", &colcount); + if (diag || colcount <= 1) + colcount = 80; + diag = kdbgetintenv("LOGGING", &logging); if (diag) logging = 0; @@ -690,7 +695,7 @@ kdb_printit: gdbstub_msg_write(kdb_buffer, retlen); } else { if (dbg_io_ops && !dbg_io_ops->is_console) { - len = strlen(kdb_buffer); + len = retlen; cp = kdb_buffer; while (len--) { dbg_io_ops->write_char(*cp); @@ -709,11 +714,29 @@ kdb_printit: printk(KERN_INFO "%s", kdb_buffer); } - if (KDB_STATE(PAGER) && strchr(kdb_buffer, '\n')) - kdb_nextline++; + if (KDB_STATE(PAGER)) { + /* + * Check printed string to decide how to bump the + * kdb_nextline to control when the more prompt should + * show up. + */ + int got = 0; + len = retlen; + while (len--) { + if (kdb_buffer[len] == '\n') { + kdb_nextline++; + got = 0; + } else if (kdb_buffer[len] == '\r') { + got = 0; + } else { + got++; + } + } + kdb_nextline += got / (colcount + 1); + } /* check for having reached the LINES number of printed lines */ - if (kdb_nextline == linecount) { + if (kdb_nextline >= linecount) { char buf1[16] = ""; /* Watch out for recursion here. Any routine that calls @@ -765,7 +788,7 @@ kdb_printit: kdb_grepping_flag = 0; kdb_printf("\n"); } else if (buf1[0] == ' ') { - kdb_printf("\n"); + kdb_printf("\r"); suspend_grep = 1; /* for this recursion */ } else if (buf1[0] == '\n') { kdb_nextline = linecount - 1; diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 31df1706b9a9..4d5f8d5612f3 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -21,6 +21,7 @@ #include <linux/smp.h> #include <linux/utsname.h> #include <linux/vmalloc.h> +#include <linux/atomic.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/init.h> @@ -2100,6 +2101,8 @@ static int kdb_dmesg(int argc, const char **argv) } if (!lines--) break; + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; kdb_printf("%.*s\n", (int)len - 1, buf); } @@ -2107,6 +2110,32 @@ static int kdb_dmesg(int argc, const char **argv) return 0; } #endif /* CONFIG_PRINTK */ + +/* Make sure we balance enable/disable calls, must disable first. */ +static atomic_t kdb_nmi_disabled; + +static int kdb_disable_nmi(int argc, const char *argv[]) +{ + if (atomic_read(&kdb_nmi_disabled)) + return 0; + atomic_set(&kdb_nmi_disabled, 1); + arch_kgdb_ops.enable_nmi(0); + return 0; +} + +static int kdb_param_enable_nmi(const char *val, const struct kernel_param *kp) +{ + if (!atomic_add_unless(&kdb_nmi_disabled, -1, 0)) + return -EINVAL; + arch_kgdb_ops.enable_nmi(1); + return 0; +} + +static const struct kernel_param_ops kdb_param_ops_enable_nmi = { + .set = kdb_param_enable_nmi, +}; +module_param_cb(enable_nmi, &kdb_param_ops_enable_nmi, NULL, 0600); + /* * kdb_cpu - This function implements the 'cpu' command. * cpu [<cpunum>] @@ -2851,6 +2880,10 @@ static void __init kdb_inittab(void) kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", "Display syslog buffer", 0, KDB_REPEAT_NONE); #endif + if (arch_kgdb_ops.enable_nmi) { + kdb_register_repeat("disable_nmi", kdb_disable_nmi, "", + "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE); + } kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>", diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 98d4597f43d6..c77206184b8b 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -159,6 +159,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) int rctx; struct perf_callchain_entry *entry; + int kernel = !event->attr.exclude_callchain_kernel; + int user = !event->attr.exclude_callchain_user; + + if (!kernel && !user) + return NULL; entry = get_callchain_entry(&rctx); if (rctx == -1) @@ -169,24 +174,29 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) entry->nr = 0; - if (!user_mode(regs)) { + if (kernel && !user_mode(regs)) { perf_callchain_store(entry, PERF_CONTEXT_KERNEL); perf_callchain_kernel(entry, regs); - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; } - if (regs) { - /* - * Disallow cross-task user callchains. - */ - if (event->ctx->task && event->ctx->task != current) - goto exit_put; - - perf_callchain_store(entry, PERF_CONTEXT_USER); - perf_callchain_user(entry, regs); + if (user) { + if (!user_mode(regs)) { + if (current->mm) + regs = task_pt_regs(current); + else + regs = NULL; + } + + if (regs) { + /* + * Disallow cross-task user callchains. + */ + if (event->ctx->task && event->ctx->task != current) + goto exit_put; + + perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_user(entry, regs); + } } exit_put: diff --git a/kernel/events/core.c b/kernel/events/core.c index 7fee567153f0..301079d06f24 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -36,6 +36,7 @@ #include <linux/perf_event.h> #include <linux/ftrace_event.h> #include <linux/hw_breakpoint.h> +#include <linux/mm_types.h> #include "internal.h" @@ -371,6 +372,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode) list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + if (cpuctx->unique_pmu != pmu) + continue; /* ensure we process each cpuctx once */ /* * perf_cgroup_events says at least one @@ -394,9 +397,10 @@ void perf_cgroup_switch(struct task_struct *task, int mode) if (mode & PERF_CGROUP_SWIN) { WARN_ON_ONCE(cpuctx->cgrp); - /* set cgrp before ctxsw in to - * allow event_filter_match() to not - * have to pass task around + /* + * set cgrp before ctxsw in to allow + * event_filter_match() to not have to pass + * task around */ cpuctx->cgrp = perf_cgroup_from_task(task); cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); @@ -467,14 +471,13 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, { struct perf_cgroup *cgrp; struct cgroup_subsys_state *css; - struct file *file; - int ret = 0, fput_needed; + struct fd f = fdget(fd); + int ret = 0; - file = fget_light(fd, &fput_needed); - if (!file) + if (!f.file) return -EBADF; - css = cgroup_css_from_dir(file, perf_subsys_id); + css = cgroup_css_from_dir(f.file, perf_subsys_id); if (IS_ERR(css)) { ret = PTR_ERR(css); goto out; @@ -500,7 +503,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, ret = -EINVAL; } out: - fput_light(file, fput_needed); + fdput(f); return ret; } @@ -3233,21 +3236,18 @@ unlock: static const struct file_operations perf_fops; -static struct file *perf_fget_light(int fd, int *fput_needed) +static inline int perf_fget_light(int fd, struct fd *p) { - struct file *file; - - file = fget_light(fd, fput_needed); - if (!file) - return ERR_PTR(-EBADF); + struct fd f = fdget(fd); + if (!f.file) + return -EBADF; - if (file->f_op != &perf_fops) { - fput_light(file, *fput_needed); - *fput_needed = 0; - return ERR_PTR(-EBADF); + if (f.file->f_op != &perf_fops) { + fdput(f); + return -EBADF; } - - return file; + *p = f; + return 0; } static int perf_event_set_output(struct perf_event *event, @@ -3279,22 +3279,19 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_EVENT_IOC_SET_OUTPUT: { - struct file *output_file = NULL; - struct perf_event *output_event = NULL; - int fput_needed = 0; int ret; - if (arg != -1) { - output_file = perf_fget_light(arg, &fput_needed); - if (IS_ERR(output_file)) - return PTR_ERR(output_file); - output_event = output_file->private_data; + struct perf_event *output_event; + struct fd output; + ret = perf_fget_light(arg, &output); + if (ret) + return ret; + output_event = output.file->private_data; + ret = perf_event_set_output(event, output_event); + fdput(output); + } else { + ret = perf_event_set_output(event, NULL); } - - ret = perf_event_set_output(event, output_event); - if (output_event) - fput_light(output_file, fput_needed); - return ret; } @@ -3677,7 +3674,7 @@ unlock: atomic_inc(&event->mmap_count); mutex_unlock(&event->mmap_mutex); - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &perf_mmap_vmops; return ret; @@ -3764,6 +3761,132 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); +static void +perf_output_sample_regs(struct perf_output_handle *handle, + struct pt_regs *regs, u64 mask) +{ + int bit; + + for_each_set_bit(bit, (const unsigned long *) &mask, + sizeof(mask) * BITS_PER_BYTE) { + u64 val; + + val = perf_reg_value(regs, bit); + perf_output_put(handle, val); + } +} + +static void perf_sample_regs_user(struct perf_regs_user *regs_user, + struct pt_regs *regs) +{ + if (!user_mode(regs)) { + if (current->mm) + regs = task_pt_regs(current); + else + regs = NULL; + } + + if (regs) { + regs_user->regs = regs; + regs_user->abi = perf_reg_abi(current); + } +} + +/* + * Get remaining task size from user stack pointer. + * + * It'd be better to take stack vma map and limit this more + * precisly, but there's no way to get it safely under interrupt, + * so using TASK_SIZE as limit. + */ +static u64 perf_ustack_task_size(struct pt_regs *regs) +{ + unsigned long addr = perf_user_stack_pointer(regs); + + if (!addr || addr >= TASK_SIZE) + return 0; + + return TASK_SIZE - addr; +} + +static u16 +perf_sample_ustack_size(u16 stack_size, u16 header_size, + struct pt_regs *regs) +{ + u64 task_size; + + /* No regs, no stack pointer, no dump. */ + if (!regs) + return 0; + + /* + * Check if we fit in with the requested stack size into the: + * - TASK_SIZE + * If we don't, we limit the size to the TASK_SIZE. + * + * - remaining sample size + * If we don't, we customize the stack size to + * fit in to the remaining sample size. + */ + + task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs)); + stack_size = min(stack_size, (u16) task_size); + + /* Current header size plus static size and dynamic size. */ + header_size += 2 * sizeof(u64); + + /* Do we fit in with the current stack dump size? */ + if ((u16) (header_size + stack_size) < header_size) { + /* + * If we overflow the maximum size for the sample, + * we customize the stack dump size to fit in. + */ + stack_size = USHRT_MAX - header_size - sizeof(u64); + stack_size = round_up(stack_size, sizeof(u64)); + } + + return stack_size; +} + +static void +perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, + struct pt_regs *regs) +{ + /* Case of a kernel thread, nothing to dump */ + if (!regs) { + u64 size = 0; + perf_output_put(handle, size); + } else { + unsigned long sp; + unsigned int rem; + u64 dyn_size; + + /* + * We dump: + * static size + * - the size requested by user or the best one we can fit + * in to the sample max size + * data + * - user stack dump data + * dynamic size + * - the actual dumped size + */ + + /* Static size. */ + perf_output_put(handle, dump_size); + + /* Data. */ + sp = perf_user_stack_pointer(regs); + rem = __output_copy_user(handle, (void *) sp, dump_size); + dyn_size = dump_size - rem; + + perf_output_skip(handle, rem); + + /* Dynamic size. */ + perf_output_put(handle, dyn_size); + } +} + static void __perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) @@ -4024,6 +4147,28 @@ void perf_output_sample(struct perf_output_handle *handle, perf_output_put(handle, nr); } } + + if (sample_type & PERF_SAMPLE_REGS_USER) { + u64 abi = data->regs_user.abi; + + /* + * If there are no regs to dump, notice it through + * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). + */ + perf_output_put(handle, abi); + + if (abi) { + u64 mask = event->attr.sample_regs_user; + perf_output_sample_regs(handle, + data->regs_user.regs, + mask); + } + } + + if (sample_type & PERF_SAMPLE_STACK_USER) + perf_output_sample_ustack(handle, + data->stack_user_size, + data->regs_user.regs); } void perf_prepare_sample(struct perf_event_header *header, @@ -4075,6 +4220,49 @@ void perf_prepare_sample(struct perf_event_header *header, } header->size += size; } + + if (sample_type & PERF_SAMPLE_REGS_USER) { + /* regs dump ABI info */ + int size = sizeof(u64); + + perf_sample_regs_user(&data->regs_user, regs); + + if (data->regs_user.regs) { + u64 mask = event->attr.sample_regs_user; + size += hweight64(mask) * sizeof(u64); + } + + header->size += size; + } + + if (sample_type & PERF_SAMPLE_STACK_USER) { + /* + * Either we need PERF_SAMPLE_STACK_USER bit to be allways + * processed as the last one or have additional check added + * in case new sample type is added, because we could eat + * up the rest of the sample size. + */ + struct perf_regs_user *uregs = &data->regs_user; + u16 stack_size = event->attr.sample_stack_user; + u16 size = sizeof(u64); + + if (!uregs->abi) + perf_sample_regs_user(uregs, regs); + + stack_size = perf_sample_ustack_size(stack_size, header->size, + uregs->regs); + + /* + * If there is something to dump, add space for the dump + * itself and for the field that tells the dynamic size, + * which is how many have been actually dumped. + */ + if (stack_size) + size += sizeof(u64) + stack_size; + + data->stack_user_size = stack_size; + header->size += size; + } } static void perf_event_output(struct perf_event *event, @@ -4227,7 +4415,7 @@ static void perf_event_task_event(struct perf_task_event *task_event) rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->active_pmu != pmu) + if (cpuctx->unique_pmu != pmu) goto next; perf_event_task_ctx(&cpuctx->ctx, task_event); @@ -4373,7 +4561,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->active_pmu != pmu) + if (cpuctx->unique_pmu != pmu) goto next; perf_event_comm_ctx(&cpuctx->ctx, comm_event); @@ -4569,7 +4757,7 @@ got_name: rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->active_pmu != pmu) + if (cpuctx->unique_pmu != pmu) goto next; perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); @@ -5670,8 +5858,8 @@ static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu) cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - if (cpuctx->active_pmu == old_pmu) - cpuctx->active_pmu = pmu; + if (cpuctx->unique_pmu == old_pmu) + cpuctx->unique_pmu = pmu; } } @@ -5806,7 +5994,7 @@ skip_type: cpuctx->ctx.pmu = pmu; cpuctx->jiffies_interval = 1; INIT_LIST_HEAD(&cpuctx->rotation_list); - cpuctx->active_pmu = pmu; + cpuctx->unique_pmu = pmu; } got_cpu_context: @@ -5967,7 +6155,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->parent = parent_event; - event->ns = get_pid_ns(current->nsproxy->pid_ns); + event->ns = get_pid_ns(task_active_pid_ns(current)); event->id = atomic64_inc_return(&perf_event_id); event->state = PERF_EVENT_STATE_INACTIVE; @@ -6151,6 +6339,28 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, attr->branch_sample_type = mask; } } + + if (attr->sample_type & PERF_SAMPLE_REGS_USER) { + ret = perf_reg_validate(attr->sample_regs_user); + if (ret) + return ret; + } + + if (attr->sample_type & PERF_SAMPLE_STACK_USER) { + if (!arch_perf_have_user_stack_dump()) + return -ENOSYS; + + /* + * We have __u32 type for the size, but so far + * we can only use __u16 as maximum due to the + * __u16 sample size limit. + */ + if (attr->sample_stack_user >= USHRT_MAX) + ret = -EINVAL; + else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64))) + ret = -EINVAL; + } + out: return ret; @@ -6229,12 +6439,11 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_attr attr; struct perf_event_context *ctx; struct file *event_file = NULL; - struct file *group_file = NULL; + struct fd group = {NULL, 0}; struct task_struct *task = NULL; struct pmu *pmu; int event_fd; int move_group = 0; - int fput_needed = 0; int err; /* for future expandability... */ @@ -6264,17 +6473,15 @@ SYSCALL_DEFINE5(perf_event_open, if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) return -EINVAL; - event_fd = get_unused_fd_flags(O_RDWR); + event_fd = get_unused_fd(); if (event_fd < 0) return event_fd; if (group_fd != -1) { - group_file = perf_fget_light(group_fd, &fput_needed); - if (IS_ERR(group_file)) { - err = PTR_ERR(group_file); + err = perf_fget_light(group_fd, &group); + if (err) goto err_fd; - } - group_leader = group_file->private_data; + group_leader = group.file->private_data; if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; if (flags & PERF_FLAG_FD_NO_GROUP) @@ -6450,7 +6657,7 @@ SYSCALL_DEFINE5(perf_event_open, * of the group leader will find the pointer to itself in * perf_group_detach(). */ - fput_light(group_file, fput_needed); + fdput(group); fd_install(event_fd, event_file); return event_fd; @@ -6464,7 +6671,7 @@ err_task: if (task) put_task_struct(task); err_group_fd: - fput_light(group_file, fput_needed); + fdput(group); err_fd: put_unused_fd(event_fd); return err; @@ -7227,7 +7434,7 @@ unlock: device_initcall(perf_event_sysfs_init); #ifdef CONFIG_CGROUP_PERF -static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) +static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) { struct perf_cgroup *jc; @@ -7244,7 +7451,7 @@ static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) return &jc->css; } -static void perf_cgroup_destroy(struct cgroup *cont) +static void perf_cgroup_css_free(struct cgroup *cont) { struct perf_cgroup *jc; jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), @@ -7285,9 +7492,16 @@ static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, struct cgroup_subsys perf_subsys = { .name = "perf_event", .subsys_id = perf_subsys_id, - .create = perf_cgroup_create, - .destroy = perf_cgroup_destroy, + .css_alloc = perf_cgroup_css_alloc, + .css_free = perf_cgroup_css_free, .exit = perf_cgroup_exit, .attach = perf_cgroup_attach, + + /* + * perf_event cgroup doesn't handle nesting correctly. + * ctx->nr_cgroups adjustments should be propagated through the + * cgroup hierarchy. Fix it and remove the following. + */ + .broken_hierarchy = true, }; #endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 9a7b487c6fe2..fe8a916507ed 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -111,14 +111,16 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) * Count the number of breakpoints of the same type and same task. * The given event must be not on the list. */ -static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) +static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) { struct task_struct *tsk = bp->hw.bp_target; struct perf_event *iter; int count = 0; list_for_each_entry(iter, &bp_task_head, hw.bp_list) { - if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type) + if (iter->hw.bp_target == tsk && + find_slot_idx(iter) == type && + cpu == iter->cpu) count += hw_breakpoint_weight(iter); } @@ -141,7 +143,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, if (!tsk) slots->pinned += max_task_bp_pinned(cpu, type); else - slots->pinned += task_bp_pinned(bp, type); + slots->pinned += task_bp_pinned(cpu, bp, type); slots->flexible = per_cpu(nr_bp_flexible[type], cpu); return; @@ -154,7 +156,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, if (!tsk) nr += max_task_bp_pinned(cpu, type); else - nr += task_bp_pinned(bp, type); + nr += task_bp_pinned(cpu, bp, type); if (nr > slots->pinned) slots->pinned = nr; @@ -188,7 +190,7 @@ static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, int old_idx = 0; int idx = 0; - old_count = task_bp_pinned(bp, type); + old_count = task_bp_pinned(cpu, bp, type); old_idx = old_count - 1; idx = old_idx + weight; diff --git a/kernel/events/internal.h b/kernel/events/internal.h index a096c19f2c2a..d56a64c99a8b 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -2,6 +2,7 @@ #define _KERNEL_EVENTS_INTERNAL_H #include <linux/hardirq.h> +#include <linux/uaccess.h> /* Buffer handling */ @@ -76,30 +77,53 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb) return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); } -static inline void -__output_copy(struct perf_output_handle *handle, - const void *buf, unsigned int len) +#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ +static inline unsigned int \ +func_name(struct perf_output_handle *handle, \ + const void *buf, unsigned int len) \ +{ \ + unsigned long size, written; \ + \ + do { \ + size = min_t(unsigned long, handle->size, len); \ + \ + written = memcpy_func(handle->addr, buf, size); \ + \ + len -= written; \ + handle->addr += written; \ + buf += written; \ + handle->size -= written; \ + if (!handle->size) { \ + struct ring_buffer *rb = handle->rb; \ + \ + handle->page++; \ + handle->page &= rb->nr_pages - 1; \ + handle->addr = rb->data_pages[handle->page]; \ + handle->size = PAGE_SIZE << page_order(rb); \ + } \ + } while (len && written == size); \ + \ + return len; \ +} + +static inline int memcpy_common(void *dst, const void *src, size_t n) { - do { - unsigned long size = min_t(unsigned long, handle->size, len); - - memcpy(handle->addr, buf, size); - - len -= size; - handle->addr += size; - buf += size; - handle->size -= size; - if (!handle->size) { - struct ring_buffer *rb = handle->rb; - - handle->page++; - handle->page &= rb->nr_pages - 1; - handle->addr = rb->data_pages[handle->page]; - handle->size = PAGE_SIZE << page_order(rb); - } - } while (len); + memcpy(dst, src, n); + return n; } +DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) + +#define MEMCPY_SKIP(dst, src, n) (n) + +DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP) + +#ifndef arch_perf_out_copy_user +#define arch_perf_out_copy_user __copy_from_user_inatomic +#endif + +DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) + /* Callchain handling */ extern struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs); @@ -134,4 +158,20 @@ static inline void put_recursion_context(int *recursion, int rctx) recursion[rctx]--; } +#ifdef CONFIG_HAVE_PERF_USER_STACK_DUMP +static inline bool arch_perf_have_user_stack_dump(void) +{ + return true; +} + +#define perf_user_stack_pointer(regs) user_stack_pointer(regs) +#else +static inline bool arch_perf_have_user_stack_dump(void) +{ + return false; +} + +#define perf_user_stack_pointer(regs) 0 +#endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */ + #endif /* _KERNEL_EVENTS_INTERNAL_H */ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 6ddaba43fb7a..23cb34ff3973 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -182,10 +182,16 @@ out: return -ENOSPC; } -void perf_output_copy(struct perf_output_handle *handle, +unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len) { - __output_copy(handle, buf, len); + return __output_copy(handle, buf, len); +} + +unsigned int perf_output_skip(struct perf_output_handle *handle, + unsigned int len) +{ + return __output_skip(handle, NULL, len); } void perf_output_end(struct perf_output_handle *handle) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c08a22d02f72..dea7acfbb071 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -33,6 +33,7 @@ #include <linux/ptrace.h> /* user_enable_single_step */ #include <linux/kdebug.h> /* notifier mechanism */ #include "../../mm/internal.h" /* munlock_vma_page */ +#include <linux/percpu-rwsem.h> #include <linux/uprobes.h> @@ -71,6 +72,8 @@ static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) +static struct percpu_rw_semaphore dup_mmap_sem; + /* * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe * events active at this time. Probably a fine grained per inode count is @@ -78,15 +81,23 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; */ static atomic_t uprobe_events = ATOMIC_INIT(0); +/* Have a copy of original instruction */ +#define UPROBE_COPY_INSN 0 +/* Dont run handlers when first register/ last unregister in progress*/ +#define UPROBE_RUN_HANDLER 1 +/* Can skip singlestep */ +#define UPROBE_SKIP_SSTEP 2 + struct uprobe { struct rb_node rb_node; /* node in the rb tree */ atomic_t ref; struct rw_semaphore consumer_rwsem; + struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */ struct list_head pending_list; struct uprobe_consumer *consumers; struct inode *inode; /* Also hold a ref to inode */ loff_t offset; - int flags; + unsigned long flags; struct arch_uprobe arch; }; @@ -100,17 +111,12 @@ struct uprobe { */ static bool valid_vma(struct vm_area_struct *vma, bool is_register) { - if (!vma->vm_file) - return false; - - if (!is_register) - return true; + vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_SHARED; - if ((vma->vm_flags & (VM_HUGETLB|VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) - == (VM_READ|VM_EXEC)) - return true; + if (is_register) + flags |= VM_WRITE; - return false; + return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC; } static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset) @@ -141,10 +147,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, spinlock_t *ptl; pte_t *ptep; int err; + /* For mmu_notifiers */ + const unsigned long mmun_start = addr; + const unsigned long mmun_end = addr + PAGE_SIZE; /* For try_to_free_swap() and munlock_vma_page() below */ lock_page(page); + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); err = -EAGAIN; ptep = page_check_address(page, mm, addr, &ptl, 0); if (!ptep) @@ -173,6 +183,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, err = 0; unlock: + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); unlock_page(page); return err; } @@ -188,19 +199,44 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn) return *insn == UPROBE_SWBP_INSN; } +static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode) +{ + void *kaddr = kmap_atomic(page); + memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE); + kunmap_atomic(kaddr); +} + +static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode) +{ + uprobe_opcode_t old_opcode; + bool is_swbp; + + copy_opcode(page, vaddr, &old_opcode); + is_swbp = is_swbp_insn(&old_opcode); + + if (is_swbp_insn(new_opcode)) { + if (is_swbp) /* register: already installed? */ + return 0; + } else { + if (!is_swbp) /* unregister: was it changed by us? */ + return 0; + } + + return 1; +} + /* * NOTE: * Expect the breakpoint instruction to be the smallest size instruction for * the architecture. If an arch has variable length instruction and the * breakpoint instruction is not of the smallest length instruction - * supported by that architecture then we need to modify read_opcode / + * supported by that architecture then we need to modify is_swbp_at_addr and * write_opcode accordingly. This would never be a problem for archs that * have fixed length instructions. */ /* * write_opcode - write the opcode at a given virtual address. - * @auprobe: arch breakpointing information. * @mm: the probed process address space. * @vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @vaddr. @@ -211,8 +247,8 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn) * For mm @mm, write the opcode at @vaddr. * Return 0 (success) or a negative errno. */ -static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, - unsigned long vaddr, uprobe_opcode_t opcode) +static int write_opcode(struct mm_struct *mm, unsigned long vaddr, + uprobe_opcode_t opcode) { struct page *old_page, *new_page; void *vaddr_old, *vaddr_new; @@ -221,10 +257,14 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, retry: /* Read the page with vaddr into memory */ - ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); + ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); if (ret <= 0) return ret; + ret = verify_opcode(old_page, vaddr, &opcode); + if (ret <= 0) + goto put_old; + ret = -ENOMEM; new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); if (!new_page) @@ -259,65 +299,6 @@ put_old: } /** - * read_opcode - read the opcode at a given virtual address. - * @mm: the probed process address space. - * @vaddr: the virtual address to read the opcode. - * @opcode: location to store the read opcode. - * - * Called with mm->mmap_sem held (for read and with a reference to - * mm. - * - * For mm @mm, read the opcode at @vaddr and store it in @opcode. - * Return 0 (success) or a negative errno. - */ -static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t *opcode) -{ - struct page *page; - void *vaddr_new; - int ret; - - ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL); - if (ret <= 0) - return ret; - - lock_page(page); - vaddr_new = kmap_atomic(page); - vaddr &= ~PAGE_MASK; - memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE); - kunmap_atomic(vaddr_new); - unlock_page(page); - - put_page(page); - - return 0; -} - -static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) -{ - uprobe_opcode_t opcode; - int result; - - if (current->mm == mm) { - pagefault_disable(); - result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr, - sizeof(opcode)); - pagefault_enable(); - - if (likely(result == 0)) - goto out; - } - - result = read_opcode(mm, vaddr, &opcode); - if (result) - return result; -out: - if (is_swbp_insn(&opcode)) - return 1; - - return 0; -} - -/** * set_swbp - store breakpoint at a given address. * @auprobe: arch specific probepoint information. * @mm: the probed process address space. @@ -328,18 +309,7 @@ out: */ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - int result; - /* - * See the comment near uprobes_hash(). - */ - result = is_swbp_at_addr(mm, vaddr); - if (result == 1) - return -EEXIST; - - if (result) - return result; - - return write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN); + return write_opcode(mm, vaddr, UPROBE_SWBP_INSN); } /** @@ -347,25 +317,14 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned * @mm: the probed process address space. * @auprobe: arch specific probepoint information. * @vaddr: the virtual address to insert the opcode. - * @verify: if true, verify existance of breakpoint instruction. * * For mm @mm, restore the original opcode (opcode) at @vaddr. * Return 0 (success) or a negative errno. */ int __weak -set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, bool verify) +set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - if (verify) { - int result; - - result = is_swbp_at_addr(mm, vaddr); - if (!result) - return -EINVAL; - - if (result != 1) - return result; - } - return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); + return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); } static int match_uprobe(struct uprobe *l, struct uprobe *r) @@ -415,11 +374,10 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) { struct uprobe *uprobe; - unsigned long flags; - spin_lock_irqsave(&uprobes_treelock, flags); + spin_lock(&uprobes_treelock); uprobe = __find_uprobe(inode, offset); - spin_unlock_irqrestore(&uprobes_treelock, flags); + spin_unlock(&uprobes_treelock); return uprobe; } @@ -466,15 +424,14 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe) */ static struct uprobe *insert_uprobe(struct uprobe *uprobe) { - unsigned long flags; struct uprobe *u; - spin_lock_irqsave(&uprobes_treelock, flags); + spin_lock(&uprobes_treelock); u = __insert_uprobe(uprobe); - spin_unlock_irqrestore(&uprobes_treelock, flags); + spin_unlock(&uprobes_treelock); /* For now assume that the instruction need not be single-stepped */ - uprobe->flags |= UPROBE_SKIP_SSTEP; + __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); return u; } @@ -496,6 +453,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->inode = igrab(inode); uprobe->offset = offset; init_rwsem(&uprobe->consumer_rwsem); + mutex_init(&uprobe->copy_mutex); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); @@ -516,7 +474,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) { struct uprobe_consumer *uc; - if (!(uprobe->flags & UPROBE_RUN_HANDLER)) + if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags)) return; down_read(&uprobe->consumer_rwsem); @@ -622,33 +580,48 @@ static int copy_insn(struct uprobe *uprobe, struct file *filp) return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); } -/* - * How mm->uprobes_state.count gets updated - * uprobe_mmap() increments the count if - * - it successfully adds a breakpoint. - * - it cannot add a breakpoint, but sees that there is a underlying - * breakpoint (via a is_swbp_at_addr()). - * - * uprobe_munmap() decrements the count if - * - it sees a underlying breakpoint, (via is_swbp_at_addr) - * (Subsequent uprobe_unregister wouldnt find the breakpoint - * unless a uprobe_mmap kicks in, since the old vma would be - * dropped just after uprobe_munmap.) - * - * uprobe_register increments the count if: - * - it successfully adds a breakpoint. - * - * uprobe_unregister decrements the count if: - * - it sees a underlying breakpoint and removes successfully. - * (via is_swbp_at_addr) - * (Subsequent uprobe_munmap wouldnt find the breakpoint - * since there is no underlying breakpoint after the - * breakpoint removal.) - */ +static int prepare_uprobe(struct uprobe *uprobe, struct file *file, + struct mm_struct *mm, unsigned long vaddr) +{ + int ret = 0; + + if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) + return ret; + + mutex_lock(&uprobe->copy_mutex); + if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) + goto out; + + ret = copy_insn(uprobe, file); + if (ret) + goto out; + + ret = -ENOTSUPP; + if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) + goto out; + + ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); + if (ret) + goto out; + + /* write_opcode() assumes we don't cross page boundary */ + BUG_ON((uprobe->offset & ~PAGE_MASK) + + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); + + smp_wmb(); /* pairs with rmb() in find_active_uprobe() */ + set_bit(UPROBE_COPY_INSN, &uprobe->flags); + + out: + mutex_unlock(&uprobe->copy_mutex); + + return ret; +} + static int install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long vaddr) { + bool first_uprobe; int ret; /* @@ -659,48 +632,38 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, * Hence behave as if probe already existed. */ if (!uprobe->consumers) - return -EEXIST; - - if (!(uprobe->flags & UPROBE_COPY_INSN)) { - ret = copy_insn(uprobe, vma->vm_file); - if (ret) - return ret; - - if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) - return -ENOTSUPP; - - ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); - if (ret) - return ret; - - /* write_opcode() assumes we don't cross page boundary */ - BUG_ON((uprobe->offset & ~PAGE_MASK) + - UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); + return 0; - uprobe->flags |= UPROBE_COPY_INSN; - } + ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr); + if (ret) + return ret; /* - * Ideally, should be updating the probe count after the breakpoint - * has been successfully inserted. However a thread could hit the - * breakpoint we just inserted even before the probe count is - * incremented. If this is the first breakpoint placed, breakpoint - * notifier might ignore uprobes and pass the trap to the thread. - * Hence increment before and decrement on failure. + * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(), + * the task can hit this breakpoint right after __replace_page(). */ - atomic_inc(&mm->uprobes_state.count); + first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags); + if (first_uprobe) + set_bit(MMF_HAS_UPROBES, &mm->flags); + ret = set_swbp(&uprobe->arch, mm, vaddr); - if (ret) - atomic_dec(&mm->uprobes_state.count); + if (!ret) + clear_bit(MMF_RECALC_UPROBES, &mm->flags); + else if (first_uprobe) + clear_bit(MMF_HAS_UPROBES, &mm->flags); return ret; } -static void +static int remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) { - if (!set_orig_insn(&uprobe->arch, mm, vaddr, true)) - atomic_dec(&mm->uprobes_state.count); + /* can happen if uprobe_register() fails */ + if (!test_bit(MMF_HAS_UPROBES, &mm->flags)) + return 0; + + set_bit(MMF_RECALC_UPROBES, &mm->flags); + return set_orig_insn(&uprobe->arch, mm, vaddr); } /* @@ -710,11 +673,9 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad */ static void delete_uprobe(struct uprobe *uprobe) { - unsigned long flags; - - spin_lock_irqsave(&uprobes_treelock, flags); + spin_lock(&uprobes_treelock); rb_erase(&uprobe->rb_node, &uprobes_tree); - spin_unlock_irqrestore(&uprobes_treelock, flags); + spin_unlock(&uprobes_treelock); iput(uprobe->inode); put_uprobe(uprobe); atomic_dec(&uprobe_events); @@ -737,7 +698,6 @@ static struct map_info * build_map_info(struct address_space *mapping, loff_t offset, bool is_register) { unsigned long pgoff = offset >> PAGE_SHIFT; - struct prio_tree_iter iter; struct vm_area_struct *vma; struct map_info *curr = NULL; struct map_info *prev = NULL; @@ -746,7 +706,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) again: mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (!valid_vma(vma, is_register)) continue; @@ -809,16 +769,19 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) struct map_info *info; int err = 0; + percpu_down_write(&dup_mmap_sem); info = build_map_info(uprobe->inode->i_mapping, uprobe->offset, is_register); - if (IS_ERR(info)) - return PTR_ERR(info); + if (IS_ERR(info)) { + err = PTR_ERR(info); + goto out; + } while (info) { struct mm_struct *mm = info->mm; struct vm_area_struct *vma; - if (err) + if (err && is_register) goto free; down_write(&mm->mmap_sem); @@ -831,24 +794,19 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) vaddr_to_offset(vma, info->vaddr) != uprobe->offset) goto unlock; - if (is_register) { + if (is_register) err = install_breakpoint(uprobe, mm, vma, info->vaddr); - /* - * We can race against uprobe_mmap(), see the - * comment near uprobe_hash(). - */ - if (err == -EEXIST) - err = 0; - } else { - remove_breakpoint(uprobe, mm, info->vaddr); - } + else + err |= remove_breakpoint(uprobe, mm, info->vaddr); + unlock: up_write(&mm->mmap_sem); free: mmput(mm); info = free_map_info(info); } - + out: + percpu_up_write(&dup_mmap_sem); return err; } @@ -897,18 +855,21 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * mutex_lock(uprobes_hash(inode)); uprobe = alloc_uprobe(inode, offset); - if (uprobe && !consumer_add(uprobe, uc)) { + if (!uprobe) { + ret = -ENOMEM; + } else if (!consumer_add(uprobe, uc)) { ret = __uprobe_register(uprobe); if (ret) { uprobe->consumers = NULL; __uprobe_unregister(uprobe); } else { - uprobe->flags |= UPROBE_RUN_HANDLER; + set_bit(UPROBE_RUN_HANDLER, &uprobe->flags); } } mutex_unlock(uprobes_hash(inode)); - put_uprobe(uprobe); + if (uprobe) + put_uprobe(uprobe); return ret; } @@ -935,7 +896,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume if (consumer_del(uprobe, uc)) { if (!uprobe->consumers) { __uprobe_unregister(uprobe); - uprobe->flags &= ~UPROBE_RUN_HANDLER; + clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags); } } @@ -978,7 +939,6 @@ static void build_probe_list(struct inode *inode, struct list_head *head) { loff_t min, max; - unsigned long flags; struct rb_node *n, *t; struct uprobe *u; @@ -986,7 +946,7 @@ static void build_probe_list(struct inode *inode, min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; - spin_lock_irqsave(&uprobes_treelock, flags); + spin_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); if (n) { for (t = n; t; t = rb_prev(t)) { @@ -1004,27 +964,20 @@ static void build_probe_list(struct inode *inode, atomic_inc(&u->ref); } } - spin_unlock_irqrestore(&uprobes_treelock, flags); + spin_unlock(&uprobes_treelock); } /* - * Called from mmap_region. - * called with mm->mmap_sem acquired. + * Called from mmap_region/vma_adjust with mm->mmap_sem acquired. * - * Return -ve no if we fail to insert probes and we cannot - * bail-out. - * Return 0 otherwise. i.e: - * - * - successful insertion of probes - * - (or) no possible probes to be inserted. - * - (or) insertion of probes failed but we can bail-out. + * Currently we ignore all errors and always return 0, the callers + * can't handle the failure anyway. */ int uprobe_mmap(struct vm_area_struct *vma) { struct list_head tmp_list; struct uprobe *uprobe, *u; struct inode *inode; - int ret, count; if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) return 0; @@ -1036,44 +989,35 @@ int uprobe_mmap(struct vm_area_struct *vma) mutex_lock(uprobes_mmap_hash(inode)); build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); - ret = 0; - count = 0; - list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { - if (!ret) { + if (!fatal_signal_pending(current)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); - - ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); - /* - * We can race against uprobe_register(), see the - * comment near uprobe_hash(). - */ - if (ret == -EEXIST) { - ret = 0; - - if (!is_swbp_at_addr(vma->vm_mm, vaddr)) - continue; - - /* - * Unable to insert a breakpoint, but - * breakpoint lies underneath. Increment the - * probe count. - */ - atomic_inc(&vma->vm_mm->uprobes_state.count); - } - - if (!ret) - count++; + install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); } put_uprobe(uprobe); } - mutex_unlock(uprobes_mmap_hash(inode)); - if (ret) - atomic_sub(count, &vma->vm_mm->uprobes_state.count); + return 0; +} - return ret; +static bool +vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end) +{ + loff_t min, max; + struct inode *inode; + struct rb_node *n; + + inode = vma->vm_file->f_mapping->host; + + min = vaddr_to_offset(vma, start); + max = min + (end - start) - 1; + + spin_lock(&uprobes_treelock); + n = find_node_in_range(inode, min, max); + spin_unlock(&uprobes_treelock); + + return !!n; } /* @@ -1081,37 +1025,18 @@ int uprobe_mmap(struct vm_area_struct *vma) */ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - struct list_head tmp_list; - struct uprobe *uprobe, *u; - struct inode *inode; - if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) return; if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ return; - if (!atomic_read(&vma->vm_mm->uprobes_state.count)) + if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) || + test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags)) return; - inode = vma->vm_file->f_mapping->host; - if (!inode) - return; - - mutex_lock(uprobes_mmap_hash(inode)); - build_probe_list(inode, vma, start, end, &tmp_list); - - list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { - unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); - /* - * An unregister could have removed the probe before - * unmap. So check before we decrement the count. - */ - if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1) - atomic_dec(&vma->vm_mm->uprobes_state.count); - put_uprobe(uprobe); - } - mutex_unlock(uprobes_mmap_hash(inode)); + if (vma_has_uprobes(vma, start, end)) + set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags); } /* Slot allocation for XOL */ @@ -1213,13 +1138,25 @@ void uprobe_clear_state(struct mm_struct *mm) kfree(area); } -/* - * uprobe_reset_state - Free the area allocated for slots. - */ -void uprobe_reset_state(struct mm_struct *mm) +void uprobe_start_dup_mmap(void) +{ + percpu_down_read(&dup_mmap_sem); +} + +void uprobe_end_dup_mmap(void) { - mm->uprobes_state.xol_area = NULL; - atomic_set(&mm->uprobes_state.count, 0); + percpu_up_read(&dup_mmap_sem); +} + +void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) +{ + newmm->uprobes_state.xol_area = NULL; + + if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) { + set_bit(MMF_HAS_UPROBES, &newmm->flags); + /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */ + set_bit(MMF_RECALC_UPROBES, &newmm->flags); + } } /* @@ -1279,6 +1216,11 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot vaddr = kmap_atomic(area->page); memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); kunmap_atomic(vaddr); + /* + * We probably need flush_icache_user_range() but it needs vma. + * This should work on supported architectures too. + */ + flush_dcache_page(area->page); return current->utask->xol_vaddr; } @@ -1430,13 +1372,57 @@ bool uprobe_deny_signal(void) */ static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) { - if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) - return true; - - uprobe->flags &= ~UPROBE_SKIP_SSTEP; + if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) { + if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) + return true; + clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); + } return false; } +static void mmf_recalc_uprobes(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (!valid_vma(vma, false)) + continue; + /* + * This is not strictly accurate, we can race with + * uprobe_unregister() and see the already removed + * uprobe if delete_uprobe() was not yet called. + */ + if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end)) + return; + } + + clear_bit(MMF_HAS_UPROBES, &mm->flags); +} + +static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) +{ + struct page *page; + uprobe_opcode_t opcode; + int result; + + pagefault_disable(); + result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr, + sizeof(opcode)); + pagefault_enable(); + + if (likely(result == 0)) + goto out; + + result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL); + if (result < 0) + return result; + + copy_opcode(page, vaddr, &opcode); + put_page(page); + out: + return is_swbp_insn(&opcode); +} + static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) { struct mm_struct *mm = current->mm; @@ -1458,6 +1444,9 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) } else { *is_swbp = -EFAULT; } + + if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags)) + mmf_recalc_uprobes(mm); up_read(&mm->mmap_sem); return uprobe; @@ -1494,41 +1483,41 @@ static void handle_swbp(struct pt_regs *regs) } return; } + /* + * TODO: move copy_insn/etc into _register and remove this hack. + * After we hit the bp, _unregister + _register can install the + * new and not-yet-analyzed uprobe at the same address, restart. + */ + smp_rmb(); /* pairs with wmb() in install_breakpoint() */ + if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) + goto restart; utask = current->utask; if (!utask) { utask = add_utask(); /* Cannot allocate; re-execute the instruction. */ if (!utask) - goto cleanup_ret; + goto restart; } - utask->active_uprobe = uprobe; + handler_chain(uprobe, regs); - if (uprobe->flags & UPROBE_SKIP_SSTEP && can_skip_sstep(uprobe, regs)) - goto cleanup_ret; + if (can_skip_sstep(uprobe, regs)) + goto out; - utask->state = UTASK_SSTEP; if (!pre_ssout(uprobe, regs, bp_vaddr)) { - user_enable_single_step(current); + utask->active_uprobe = uprobe; + utask->state = UTASK_SSTEP; return; } -cleanup_ret: - if (utask) { - utask->active_uprobe = NULL; - utask->state = UTASK_RUNNING; - } - if (uprobe) { - if (!(uprobe->flags & UPROBE_SKIP_SSTEP)) - - /* - * cannot singlestep; cannot skip instruction; - * re-execute the instruction. - */ - instruction_pointer_set(regs, bp_vaddr); - - put_uprobe(uprobe); - } +restart: + /* + * cannot singlestep; cannot skip instruction; + * re-execute the instruction. + */ + instruction_pointer_set(regs, bp_vaddr); +out: + put_uprobe(uprobe); } /* @@ -1550,7 +1539,6 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) put_uprobe(uprobe); utask->active_uprobe = NULL; utask->state = UTASK_RUNNING; - user_disable_single_step(current); xol_free_insn_slot(current); spin_lock_irq(¤t->sighand->siglock); @@ -1559,13 +1547,12 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) } /* - * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag. (and on - * subsequent probe hits on the thread sets the state to UTASK_BP_HIT) and - * allows the thread to return from interrupt. + * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and + * allows the thread to return from interrupt. After that handle_swbp() + * sets utask->active_uprobe. * - * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag and - * also sets the state to UTASK_SSTEP_ACK and allows the thread to return from - * interrupt. + * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag + * and allows the thread to return from interrupt. * * While returning to userspace, thread notices the TIF_UPROBE flag and calls * uprobe_notify_resume(). @@ -1574,11 +1561,13 @@ void uprobe_notify_resume(struct pt_regs *regs) { struct uprobe_task *utask; + clear_thread_flag(TIF_UPROBE); + utask = current->utask; - if (!utask || utask->state == UTASK_BP_HIT) - handle_swbp(regs); - else + if (utask && utask->active_uprobe) handle_singlestep(utask, regs); + else + handle_swbp(regs); } /* @@ -1587,18 +1576,10 @@ void uprobe_notify_resume(struct pt_regs *regs) */ int uprobe_pre_sstep_notifier(struct pt_regs *regs) { - struct uprobe_task *utask; - - if (!current->mm || !atomic_read(¤t->mm->uprobes_state.count)) - /* task is currently not uprobed */ + if (!current->mm || !test_bit(MMF_HAS_UPROBES, ¤t->mm->flags)) return 0; - utask = current->utask; - if (utask) - utask->state = UTASK_BP_HIT; - set_thread_flag(TIF_UPROBE); - return 1; } @@ -1633,6 +1614,9 @@ static int __init init_uprobes(void) mutex_init(&uprobes_mmap_mutex[i]); } + if (percpu_init_rwsem(&dup_mmap_sem)) + return -ENOMEM; + return register_die_notifier(&uprobe_exception_nb); } module_init(init_uprobes); diff --git a/kernel/exit.c b/kernel/exit.c index f65345f9e5bb..b4df21937216 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -72,18 +72,6 @@ static void __unhash_process(struct task_struct *p, bool group_dead) list_del_rcu(&p->tasks); list_del_init(&p->sibling); __this_cpu_dec(process_counts); - /* - * If we are the last child process in a pid namespace to be - * reaped, notify the reaper sleeping zap_pid_ns_processes(). - */ - if (IS_ENABLED(CONFIG_PID_NS)) { - struct task_struct *parent = p->real_parent; - - if ((task_active_pid_ns(parent)->child_reaper == parent) && - list_empty(&parent->children) && - (parent->flags & PF_EXITING)) - wake_up_process(parent); - } } list_del_rcu(&p->thread_group); } @@ -322,43 +310,6 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) } } -/** - * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd - * - * If a kernel thread is launched as a result of a system call, or if - * it ever exits, it should generally reparent itself to kthreadd so it - * isn't in the way of other processes and is correctly cleaned up on exit. - * - * The various task state such as scheduling policy and priority may have - * been inherited from a user process, so we reset them to sane values here. - * - * NOTE that reparent_to_kthreadd() gives the caller full capabilities. - */ -static void reparent_to_kthreadd(void) -{ - write_lock_irq(&tasklist_lock); - - ptrace_unlink(current); - /* Reparent to init */ - current->real_parent = current->parent = kthreadd_task; - list_move_tail(¤t->sibling, ¤t->real_parent->children); - - /* Set the exit signal to SIGCHLD so we signal init on exit */ - current->exit_signal = SIGCHLD; - - if (task_nice(current) < 0) - set_user_nice(current, 0); - /* cpus_allowed? */ - /* rt_priority? */ - /* signals? */ - memcpy(current->signal->rlim, init_task.signal->rlim, - sizeof(current->signal->rlim)); - - atomic_inc(&init_cred.usage); - commit_creds(&init_cred); - write_unlock_irq(&tasklist_lock); -} - void __set_special_pids(struct pid *pid) { struct task_struct *curr = current->group_leader; @@ -370,13 +321,6 @@ void __set_special_pids(struct pid *pid) change_pid(curr, PIDTYPE_PGID, pid); } -static void set_special_pids(struct pid *pid) -{ - write_lock_irq(&tasklist_lock); - __set_special_pids(pid); - write_unlock_irq(&tasklist_lock); -} - /* * Let kernel threads use this to say that they allow a certain signal. * Must not be used if kthread was cloned with CLONE_SIGHAND. @@ -416,149 +360,6 @@ int disallow_signal(int sig) EXPORT_SYMBOL(disallow_signal); -/* - * Put all the gunge required to become a kernel thread without - * attached user resources in one place where it belongs. - */ - -void daemonize(const char *name, ...) -{ - va_list args; - sigset_t blocked; - - va_start(args, name); - vsnprintf(current->comm, sizeof(current->comm), name, args); - va_end(args); - - /* - * If we were started as result of loading a module, close all of the - * user space pages. We don't need them, and if we didn't close them - * they would be locked into memory. - */ - exit_mm(current); - /* - * We don't want to get frozen, in case system-wide hibernation - * or suspend transition begins right now. - */ - current->flags |= (PF_NOFREEZE | PF_KTHREAD); - - if (current->nsproxy != &init_nsproxy) { - get_nsproxy(&init_nsproxy); - switch_task_namespaces(current, &init_nsproxy); - } - set_special_pids(&init_struct_pid); - proc_clear_tty(current); - - /* Block and flush all signals */ - sigfillset(&blocked); - sigprocmask(SIG_BLOCK, &blocked, NULL); - flush_signals(current); - - /* Become as one with the init task */ - - daemonize_fs_struct(); - exit_files(current); - current->files = init_task.files; - atomic_inc(¤t->files->count); - - reparent_to_kthreadd(); -} - -EXPORT_SYMBOL(daemonize); - -static void close_files(struct files_struct * files) -{ - int i, j; - struct fdtable *fdt; - - j = 0; - - /* - * It is safe to dereference the fd table without RCU or - * ->file_lock because this is the last reference to the - * files structure. But use RCU to shut RCU-lockdep up. - */ - rcu_read_lock(); - fdt = files_fdtable(files); - rcu_read_unlock(); - for (;;) { - unsigned long set; - i = j * BITS_PER_LONG; - if (i >= fdt->max_fds) - break; - set = fdt->open_fds[j++]; - while (set) { - if (set & 1) { - struct file * file = xchg(&fdt->fd[i], NULL); - if (file) { - filp_close(file, files); - cond_resched(); - } - } - i++; - set >>= 1; - } - } -} - -struct files_struct *get_files_struct(struct task_struct *task) -{ - struct files_struct *files; - - task_lock(task); - files = task->files; - if (files) - atomic_inc(&files->count); - task_unlock(task); - - return files; -} - -void put_files_struct(struct files_struct *files) -{ - struct fdtable *fdt; - - if (atomic_dec_and_test(&files->count)) { - close_files(files); - /* - * Free the fd and fdset arrays if we expanded them. - * If the fdtable was embedded, pass files for freeing - * at the end of the RCU grace period. Otherwise, - * you can free files immediately. - */ - rcu_read_lock(); - fdt = files_fdtable(files); - if (fdt != &files->fdtab) - kmem_cache_free(files_cachep, files); - free_fdtable(fdt); - rcu_read_unlock(); - } -} - -void reset_files_struct(struct files_struct *files) -{ - struct task_struct *tsk = current; - struct files_struct *old; - - old = tsk->files; - task_lock(tsk); - tsk->files = files; - task_unlock(tsk); - put_files_struct(old); -} - -void exit_files(struct task_struct *tsk) -{ - struct files_struct * files = tsk->files; - - if (files) { - task_lock(tsk); - tsk->files = NULL; - task_unlock(tsk); - put_files_struct(files); - } -} - #ifdef CONFIG_MM_OWNER /* * A task is exiting. If it owned this mm, find a new owner for the mm. @@ -1046,6 +847,9 @@ void do_exit(long code) if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); + if (tsk->task_frag.page) + put_page(tsk->task_frag.page); + validate_creds_for_do_exit(tsk); preempt_disable(); @@ -1278,11 +1082,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * as other threads in the parent group can be right * here reaping other children at the same time. * - * We use thread_group_times() to get times for the thread + * We use thread_group_cputime_adjusted() to get times for the thread * group, which consolidates times for all threads in the * group including the group leader. */ - thread_group_times(p, &tgutime, &tgstime); + thread_group_cputime_adjusted(p, &tgutime, &tgstime); spin_lock_irq(&p->real_parent->sighand->siglock); psig = p->real_parent->signal; sig = p->signal; diff --git a/kernel/fork.c b/kernel/fork.c index 2c8857e12855..a31b823b3c2d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -146,7 +146,7 @@ void __weak arch_release_thread_info(struct thread_info *ti) static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node) { - struct page *page = alloc_pages_node(node, THREADINFO_GFP, + struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED, THREAD_SIZE_ORDER); return page ? page_address(page) : NULL; @@ -154,7 +154,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, static inline void free_thread_info(struct thread_info *ti) { - free_pages((unsigned long)ti, THREAD_SIZE_ORDER); + free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_info_cache; @@ -330,6 +330,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->btrace_seq = 0; #endif tsk->splice_pipe = NULL; + tsk->task_frag.page = NULL; account_kernel_stack(ti, 1); @@ -351,8 +352,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) unsigned long charge; struct mempolicy *pol; + uprobe_start_dup_mmap(); down_write(&oldmm->mmap_sem); flush_cache_dup_mm(oldmm); + uprobe_dup_mmap(oldmm, mm); /* * Not linked in yet - no deadlock potential: */ @@ -421,7 +424,12 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mapping->i_mmap_writable++; flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ - vma_prio_tree_add(tmp, mpnt); + if (unlikely(tmp->vm_flags & VM_NONLINEAR)) + vma_nonlinear_insert(tmp, + &mapping->i_mmap_nonlinear); + else + vma_interval_tree_insert_after(tmp, mpnt, + &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } @@ -454,9 +462,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (retval) goto out; - - if (file) - uprobe_mmap(tmp); } /* a new mm has just been created */ arch_dup_mmap(oldmm, mm); @@ -465,6 +470,7 @@ out: up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); + uprobe_end_dup_mmap(); return retval; fail_nomem_anon_vma_fork: mpol_put(pol); @@ -623,26 +629,6 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); -/* - * We added or removed a vma mapping the executable. The vmas are only mapped - * during exec and are not mapped with the mmap system call. - * Callers must hold down_write() on the mm's mmap_sem for these - */ -void added_exe_file_vma(struct mm_struct *mm) -{ - mm->num_exe_file_vmas++; -} - -void removed_exe_file_vma(struct mm_struct *mm) -{ - mm->num_exe_file_vmas--; - if ((mm->num_exe_file_vmas == 0) && mm->exe_file) { - fput(mm->exe_file); - mm->exe_file = NULL; - } - -} - void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) { if (new_exe_file) @@ -650,15 +636,13 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) if (mm->exe_file) fput(mm->exe_file); mm->exe_file = new_exe_file; - mm->num_exe_file_vmas = 0; } struct file *get_mm_exe_file(struct mm_struct *mm) { struct file *exe_file; - /* We need mmap_sem to protect against races with removal of - * VM_EXECUTABLE vmas */ + /* We need mmap_sem to protect against races with removal of exe_file */ down_read(&mm->mmap_sem); exe_file = mm->exe_file; if (exe_file) @@ -839,8 +823,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk) #ifdef CONFIG_TRANSPARENT_HUGEPAGE mm->pmd_huge_pte = NULL; #endif - uprobe_reset_state(mm); - +#ifdef CONFIG_NUMA_BALANCING + mm->first_nid = NUMA_PTE_SCAN_INIT; +#endif if (!mm_init(mm, tsk)) goto fail_nomem; @@ -1059,8 +1044,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) atomic_set(&sig->live, 1); atomic_set(&sig->sigcnt, 1); init_waitqueue_head(&sig->wait_chldexit); - if (clone_flags & CLONE_NEWPID) - sig->flags |= SIGNAL_UNKILLABLE; sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); @@ -1081,7 +1064,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) init_rwsem(&sig->group_rwsem); #endif - sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; @@ -1148,7 +1130,6 @@ static void posix_cpu_timers_init(struct task_struct *tsk) */ static struct task_struct *copy_process(unsigned long clone_flags, unsigned long stack_start, - struct pt_regs *regs, unsigned long stack_size, int __user *child_tidptr, struct pid *pid, @@ -1156,7 +1137,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, { int retval; struct task_struct *p; - int cgroup_callbacks_done = 0; if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -1243,7 +1223,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->utime = p->stime = p->gtime = 0; p->utimescaled = p->stimescaled = 0; #ifndef CONFIG_VIRT_CPU_ACCOUNTING - p->prev_utime = p->prev_stime = 0; + p->prev_cputime.utime = p->prev_cputime.stime = 0; #endif #if defined(SPLIT_RSS_COUNTING) memset(&p->rss_stat, 0, sizeof(p->rss_stat)); @@ -1280,11 +1260,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - p->hardirqs_enabled = 1; -#else p->hardirqs_enabled = 0; -#endif p->hardirq_enable_ip = 0; p->hardirq_enable_event = 0; p->hardirq_disable_ip = _THIS_IP_; @@ -1345,7 +1321,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; - retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); + retval = copy_thread(clone_flags, stack_start, stack_size, p); if (retval) goto bad_fork_cleanup_io; @@ -1418,12 +1394,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; - /* Now that the task is set up, run cgroup callbacks if - * necessary. We need to run them before the task is visible - * on the tasklist. */ - cgroup_fork_callbacks(p); - cgroup_callbacks_done = 1; - /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); @@ -1466,8 +1436,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); if (thread_group_leader(p)) { - if (is_child_reaper(pid)) - p->nsproxy->pid_ns->child_reaper = p; + if (is_child_reaper(pid)) { + ns_of_pid(pid)->child_reaper = p; + p->signal->flags |= SIGNAL_UNKILLABLE; + } p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); @@ -1501,8 +1473,6 @@ bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: - if (unlikely(clone_flags & CLONE_NEWPID)) - pid_ns_release_proc(p->nsproxy->pid_ns); exit_task_namespaces(p); bad_fork_cleanup_mm: if (p->mm) @@ -1528,7 +1498,7 @@ bad_fork_cleanup_cgroup: #endif if (clone_flags & CLONE_THREAD) threadgroup_change_end(current); - cgroup_exit(p, cgroup_callbacks_done); + cgroup_exit(p, 0); delayacct_tsk_free(p); module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: @@ -1540,12 +1510,6 @@ fork_out: return ERR_PTR(retval); } -noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs) -{ - memset(regs, 0, sizeof(struct pt_regs)); - return regs; -} - static inline void init_idle_pids(struct pid_link *links) { enum pid_type type; @@ -1559,10 +1523,7 @@ static inline void init_idle_pids(struct pid_link *links) struct task_struct * __cpuinit fork_idle(int cpu) { struct task_struct *task; - struct pt_regs regs; - - task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, - &init_struct_pid, 0); + task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0); if (!IS_ERR(task)) { init_idle_pids(task->pids); init_idle(task, cpu); @@ -1579,7 +1540,6 @@ struct task_struct * __cpuinit fork_idle(int cpu) */ long do_fork(unsigned long clone_flags, unsigned long stack_start, - struct pt_regs *regs, unsigned long stack_size, int __user *parent_tidptr, int __user *child_tidptr) @@ -1592,15 +1552,9 @@ long do_fork(unsigned long clone_flags, * Do some preliminary argument and permissions checking before we * actually start allocating stuff */ - if (clone_flags & CLONE_NEWUSER) { - if (clone_flags & CLONE_THREAD) + if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) { + if (clone_flags & (CLONE_THREAD|CLONE_PARENT)) return -EINVAL; - /* hopefully this check will go away when userns support is - * complete - */ - if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) || - !capable(CAP_SETGID)) - return -EPERM; } /* @@ -1609,7 +1563,7 @@ long do_fork(unsigned long clone_flags, * requested, no event is reported; otherwise, report if the event * for the type of forking is enabled. */ - if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) { + if (!(clone_flags & CLONE_UNTRACED)) { if (clone_flags & CLONE_VFORK) trace = PTRACE_EVENT_VFORK; else if ((clone_flags & CSIGNAL) != SIGCHLD) @@ -1621,7 +1575,7 @@ long do_fork(unsigned long clone_flags, trace = 0; } - p = copy_process(clone_flags, stack_start, regs, stack_size, + p = copy_process(clone_flags, stack_start, stack_size, child_tidptr, NULL, trace); /* * Do this prior waking up the new thread - the thread pointer @@ -1659,6 +1613,58 @@ long do_fork(unsigned long clone_flags, return nr; } +/* + * Create a kernel thread. + */ +pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) +{ + return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, + (unsigned long)arg, NULL, NULL); +} + +#ifdef __ARCH_WANT_SYS_FORK +SYSCALL_DEFINE0(fork) +{ +#ifdef CONFIG_MMU + return do_fork(SIGCHLD, 0, 0, NULL, NULL); +#else + /* can not support in nommu mode */ + return(-EINVAL); +#endif +} +#endif + +#ifdef __ARCH_WANT_SYS_VFORK +SYSCALL_DEFINE0(vfork) +{ + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, + 0, NULL, NULL); +} +#endif + +#ifdef __ARCH_WANT_SYS_CLONE +#ifdef CONFIG_CLONE_BACKWARDS +SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, + int __user *, parent_tidptr, + int, tls_val, + int __user *, child_tidptr) +#elif defined(CONFIG_CLONE_BACKWARDS2) +SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags, + int __user *, parent_tidptr, + int __user *, child_tidptr, + int, tls_val) +#else +SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, + int __user *, parent_tidptr, + int __user *, child_tidptr, + int, tls_val) +#endif +{ + return do_fork(clone_flags, newsp, 0, + parent_tidptr, child_tidptr); +} +#endif + #ifndef ARCH_MIN_MMSTRUCT_ALIGN #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif @@ -1708,7 +1714,8 @@ static int check_unshare_flags(unsigned long unshare_flags) { if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| + CLONE_NEWUSER|CLONE_NEWPID)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing to @@ -1775,19 +1782,40 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) { struct fs_struct *fs, *new_fs = NULL; struct files_struct *fd, *new_fd = NULL; + struct cred *new_cred = NULL; struct nsproxy *new_nsproxy = NULL; int do_sysvsem = 0; int err; - err = check_unshare_flags(unshare_flags); - if (err) - goto bad_unshare_out; - + /* + * If unsharing a user namespace must also unshare the thread. + */ + if (unshare_flags & CLONE_NEWUSER) + unshare_flags |= CLONE_THREAD; + /* + * If unsharing a pid namespace must also unshare the thread. + */ + if (unshare_flags & CLONE_NEWPID) + unshare_flags |= CLONE_THREAD; + /* + * If unsharing a thread from a thread group, must also unshare vm. + */ + if (unshare_flags & CLONE_THREAD) + unshare_flags |= CLONE_VM; + /* + * If unsharing vm, must also unshare signal handlers. + */ + if (unshare_flags & CLONE_VM) + unshare_flags |= CLONE_SIGHAND; /* * If unsharing namespace, must also unshare filesystem information. */ if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; + + err = check_unshare_flags(unshare_flags); + if (err) + goto bad_unshare_out; /* * CLONE_NEWIPC must also detach from the undolist: after switching * to a new ipc namespace, the semaphore arrays from the old @@ -1801,11 +1829,15 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) err = unshare_fd(unshare_flags, &new_fd); if (err) goto bad_unshare_cleanup_fs; - err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs); + err = unshare_userns(unshare_flags, &new_cred); if (err) goto bad_unshare_cleanup_fd; + err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, + new_cred, new_fs); + if (err) + goto bad_unshare_cleanup_cred; - if (new_fs || new_fd || do_sysvsem || new_nsproxy) { + if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) { if (do_sysvsem) { /* * CLONE_SYSVSEM is equivalent to sys_exit(). @@ -1838,11 +1870,20 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) } task_unlock(current); + + if (new_cred) { + /* Install the new user namespace */ + commit_creds(new_cred); + new_cred = NULL; + } } if (new_nsproxy) put_nsproxy(new_nsproxy); +bad_unshare_cleanup_cred: + if (new_cred) + put_cred(new_cred); bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); diff --git a/kernel/freezer.c b/kernel/freezer.c index 11f82a4d4eae..c38893b0efba 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -116,17 +116,10 @@ bool freeze_task(struct task_struct *p) return false; } - if (!(p->flags & PF_KTHREAD)) { + if (!(p->flags & PF_KTHREAD)) fake_signal_wake_up(p); - /* - * fake_signal_wake_up() goes through p's scheduler - * lock and guarantees that TASK_STOPPED/TRACED -> - * TASK_RUNNING transition can't race with task state - * testing in try_to_freeze_tasks(). - */ - } else { + else wake_up_state(p, TASK_INTERRUPTIBLE); - } spin_unlock_irqrestore(&freezer_lock, flags); return true; diff --git a/kernel/futex.c b/kernel/futex.c index 3717e7b306e0..19eb089ca003 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -716,7 +716,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, struct futex_pi_state **ps, struct task_struct *task, int set_waiters) { - int lock_taken, ret, ownerdied = 0; + int lock_taken, ret, force_take = 0; u32 uval, newval, curval, vpid = task_pid_vnr(task); retry: @@ -755,17 +755,15 @@ retry: newval = curval | FUTEX_WAITERS; /* - * There are two cases, where a futex might have no owner (the - * owner TID is 0): OWNER_DIED. We take over the futex in this - * case. We also do an unconditional take over, when the owner - * of the futex died. - * - * This is safe as we are protected by the hash bucket lock ! + * Should we force take the futex? See below. */ - if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { - /* Keep the OWNER_DIED bit */ + if (unlikely(force_take)) { + /* + * Keep the OWNER_DIED and the WAITERS bit and set the + * new TID value. + */ newval = (curval & ~FUTEX_TID_MASK) | vpid; - ownerdied = 0; + force_take = 0; lock_taken = 1; } @@ -775,7 +773,7 @@ retry: goto retry; /* - * We took the lock due to owner died take over. + * We took the lock due to forced take over. */ if (unlikely(lock_taken)) return 1; @@ -790,20 +788,25 @@ retry: switch (ret) { case -ESRCH: /* - * No owner found for this futex. Check if the - * OWNER_DIED bit is set to figure out whether - * this is a robust futex or not. + * We failed to find an owner for this + * futex. So we have no pi_state to block + * on. This can happen in two cases: + * + * 1) The owner died + * 2) A stale FUTEX_WAITERS bit + * + * Re-read the futex value. */ if (get_futex_value_locked(&curval, uaddr)) return -EFAULT; /* - * We simply start over in case of a robust - * futex. The code above will take the futex - * and return happy. + * If the owner died or we have a stale + * WAITERS bit the owner TID in the user space + * futex is 0. */ - if (curval & FUTEX_OWNER_DIED) { - ownerdied = 1; + if (!(curval & FUTEX_TID_MASK)) { + force_take = 1; goto retry; } default: @@ -840,6 +843,9 @@ static void wake_futex(struct futex_q *q) { struct task_struct *p = q->task; + if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) + return; + /* * We set q->lock_ptr = NULL _before_ we wake up the task. If * a non-futex wake up happens on another CPU then the task @@ -1075,6 +1081,10 @@ retry_private: plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key1)) { + if (this->pi_state || this->rt_waiter) { + ret = -EINVAL; + goto out_unlock; + } wake_futex(this); if (++ret >= nr_wake) break; @@ -1087,6 +1097,10 @@ retry_private: op_ret = 0; plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key2)) { + if (this->pi_state || this->rt_waiter) { + ret = -EINVAL; + goto out_unlock; + } wake_futex(this); if (++op_ret >= nr_wake2) break; @@ -1095,6 +1109,7 @@ retry_private: ret += op_ret; } +out_unlock: double_unlock_hb(hb1, hb2); out_put_keys: put_futex_key(&key2); @@ -1384,9 +1399,13 @@ retry_private: /* * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always * be paired with each other and no other futex ops. + * + * We should never be requeueing a futex_q with a pi_state, + * which is awaiting a futex_unlock_pi(). */ if ((requeue_pi && !this->rt_waiter) || - (!requeue_pi && this->rt_waiter)) { + (!requeue_pi && this->rt_waiter) || + this->pi_state) { ret = -EINVAL; break; } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index eebd6d5cfb44..3aca9f29d30e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -272,6 +272,7 @@ void handle_nested_irq(unsigned int irq) raw_spin_lock_irq(&desc->lock); + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); action = desc->action; @@ -671,6 +672,7 @@ irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, irq_set_chip(irq, chip); __irq_set_handler(irq, handle, 0, name); } +EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name); void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) { diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c index b5fcd96c7102..988dc58e8847 100644 --- a/kernel/irq/dummychip.c +++ b/kernel/irq/dummychip.c @@ -6,6 +6,7 @@ */ #include <linux/interrupt.h> #include <linux/irq.h> +#include <linux/export.h> #include "internals.h" @@ -57,3 +58,4 @@ struct irq_chip dummy_irq_chip = { .irq_mask = noop, .irq_unmask = noop, }; +EXPORT_SYMBOL_GPL(dummy_irq_chip); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 49a77727db42..96f3a1d9c379 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -148,7 +148,8 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, * @host_data: Controller private data pointer * * Allocates a legacy irq_domain if irq_base is positive or a linear - * domain otherwise. + * domain otherwise. For the legacy domain, IRQ descriptors will also + * be allocated. * * This is intended to implement the expected behaviour for most * interrupt controllers which is that a linear mapping should @@ -162,11 +163,33 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, const struct irq_domain_ops *ops, void *host_data) { - if (first_irq > 0) - return irq_domain_add_legacy(of_node, size, first_irq, 0, + if (first_irq > 0) { + int irq_base; + + if (IS_ENABLED(CONFIG_SPARSE_IRQ)) { + /* + * Set the descriptor allocator to search for a + * 1-to-1 mapping, such as irq_alloc_desc_at(). + * Use of_node_to_nid() which is defined to + * numa_node_id() on platforms that have no custom + * implementation. + */ + irq_base = irq_alloc_descs(first_irq, first_irq, size, + of_node_to_nid(of_node)); + if (irq_base < 0) { + pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", + first_irq); + irq_base = first_irq; + } + } else + irq_base = first_irq; + + return irq_domain_add_legacy(of_node, size, irq_base, 0, ops, host_data); - else - return irq_domain_add_linear(of_node, size, ops, host_data); + } + + /* A linear domain is the default */ + return irq_domain_add_linear(of_node, size, ops, host_data); } /** diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 4c69326aa773..e49a288fa479 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -616,6 +616,22 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, return ret; } +#ifdef CONFIG_HARDIRQS_SW_RESEND +int irq_set_parent(int irq, int parent_irq) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); + + if (!desc) + return -EINVAL; + + desc->parent_irq = parent_irq; + + irq_put_desc_unlock(desc, flags); + return 0; +} +#endif + /* * Default primary interrupt handler for threaded interrupts. Is * assigned as primary handler when request_threaded_irq is called @@ -716,6 +732,7 @@ static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { cpumask_var_t mask; + bool valid = true; if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) return; @@ -730,10 +747,18 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) } raw_spin_lock_irq(&desc->lock); - cpumask_copy(mask, desc->irq_data.affinity); + /* + * This code is triggered unconditionally. Check the affinity + * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. + */ + if (desc->irq_data.affinity) + cpumask_copy(mask, desc->irq_data.affinity); + else + valid = false; raw_spin_unlock_irq(&desc->lock); - set_cpus_allowed_ptr(current, mask); + if (valid) + set_cpus_allowed_ptr(current, mask); free_cpumask_var(mask); } #else @@ -793,7 +818,7 @@ static void irq_thread_dtor(struct callback_head *unused) action = kthread_data(tsk); pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", - tsk->comm ? tsk->comm : "", tsk->pid, action->irq); + tsk->comm, tsk->pid, action->irq); desc = irq_to_desc(action->irq); @@ -833,6 +858,8 @@ static int irq_thread(void *data) init_task_work(&on_exit_work, irq_thread_dtor); task_work_add(current, &on_exit_work, false); + irq_thread_check_affinity(desc, action); + while (!irq_wait_for_interrupt(action)) { irqreturn_t action_ret; @@ -936,6 +963,16 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ get_task_struct(t); new->thread = t; + /* + * Tell the thread to set its affinity. This is + * important for shared interrupt handlers as we do + * not invoke setup_affinity() for the secondary + * handlers as everything is already set up. Even for + * interrupts marked with IRQF_NO_BALANCE this is + * correct as we want the thread to move to the cpu(s) + * on which the requesting code placed the interrupt. + */ + set_bit(IRQTF_AFFINITY, &new->thread_flags); } if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 6454db7b6a4d..9065107f083e 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -74,6 +74,14 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) if (!desc->irq_data.chip->irq_retrigger || !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { #ifdef CONFIG_HARDIRQS_SW_RESEND + /* + * If the interrupt has a parent irq and runs + * in the thread context of the parent irq, + * retrigger the parent. + */ + if (desc->parent_irq && + irq_settings_is_nested_thread(desc)) + irq = desc->parent_irq; /* Set it pending and activate the softirq: */ set_bit(irq, irqs_resend); tasklet_schedule(&resend_tasklet); diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 43049192b5ec..60f48fa0fd0d 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -118,6 +118,7 @@ void jump_label_rate_limit(struct static_key_deferred *key, key->timeout = rl; INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); } +EXPORT_SYMBOL_GPL(jump_label_rate_limit); static int addr_conflict(struct jump_entry *entry, void *start, void *end) { diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 30b7b225306c..e30ac0fe61c3 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -4,6 +4,7 @@ #include <linux/string.h> #include <linux/random.h> #include <linux/module.h> +#include <linux/ptrace.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/cache.h> diff --git a/kernel/kexec.c b/kernel/kexec.c index 0668d58d6413..5e4bd7864c5d 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -21,7 +21,6 @@ #include <linux/hardirq.h> #include <linux/elf.h> #include <linux/elfcore.h> -#include <generated/utsrelease.h> #include <linux/utsname.h> #include <linux/numa.h> #include <linux/suspend.h> diff --git a/kernel/kmod.c b/kernel/kmod.c index 6f99aead66c6..0023a87e8de6 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -37,6 +37,7 @@ #include <linux/notifier.h> #include <linux/suspend.h> #include <linux/rwsem.h> +#include <linux/ptrace.h> #include <asm/uaccess.h> #include <trace/events/module.h> @@ -218,14 +219,16 @@ static int ____call_usermodehelper(void *data) commit_creds(new); - retval = kernel_execve(sub_info->path, - (const char *const *)sub_info->argv, - (const char *const *)sub_info->envp); + retval = do_execve(sub_info->path, + (const char __user *const __user *)sub_info->argv, + (const char __user *const __user *)sub_info->envp); + if (!retval) + return 0; /* Exec failed? */ fail: sub_info->retval = retval; - return 0; + do_exit(0); } static int call_helper(void *data) @@ -292,7 +295,7 @@ static int wait_for_helper(void *data) } umh_complete(sub_info); - return 0; + do_exit(0); } /* This is run by khelper thread */ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index c62b8546cc90..098f396aa409 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -561,9 +561,9 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) { LIST_HEAD(free_list); + mutex_lock(&kprobe_mutex); /* Lock modules while optimizing kprobes */ mutex_lock(&module_mutex); - mutex_lock(&kprobe_mutex); /* * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) @@ -586,8 +586,8 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) /* Step 4: Free cleaned kprobes after quiesence period */ do_free_cleaned_kprobes(&free_list); - mutex_unlock(&kprobe_mutex); mutex_unlock(&module_mutex); + mutex_unlock(&kprobe_mutex); /* Step 5: Kick optimizer again if needed */ if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) @@ -759,20 +759,32 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p) struct kprobe *ap; struct optimized_kprobe *op; + /* Impossible to optimize ftrace-based kprobe */ + if (kprobe_ftrace(p)) + return; + + /* For preparing optimization, jump_label_text_reserved() is called */ + jump_label_lock(); + mutex_lock(&text_mutex); + ap = alloc_aggr_kprobe(p); if (!ap) - return; + goto out; op = container_of(ap, struct optimized_kprobe, kp); if (!arch_prepared_optinsn(&op->optinsn)) { /* If failed to setup optimizing, fallback to kprobe */ arch_remove_optimized_kprobe(op); kfree(op); - return; + goto out; } init_aggr_kprobe(ap, p); - optimize_kprobe(ap); + optimize_kprobe(ap); /* This just kicks optimizer thread */ + +out: + mutex_unlock(&text_mutex); + jump_label_unlock(); } #ifdef CONFIG_SYSCTL @@ -907,9 +919,64 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) } #endif /* CONFIG_OPTPROBES */ +#ifdef KPROBES_CAN_USE_FTRACE +static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { + .func = kprobe_ftrace_handler, + .flags = FTRACE_OPS_FL_SAVE_REGS, +}; +static int kprobe_ftrace_enabled; + +/* Must ensure p->addr is really on ftrace */ +static int __kprobes prepare_kprobe(struct kprobe *p) +{ + if (!kprobe_ftrace(p)) + return arch_prepare_kprobe(p); + + return arch_prepare_kprobe_ftrace(p); +} + +/* Caller must lock kprobe_mutex */ +static void __kprobes arm_kprobe_ftrace(struct kprobe *p) +{ + int ret; + + ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, + (unsigned long)p->addr, 0, 0); + WARN(ret < 0, "Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret); + kprobe_ftrace_enabled++; + if (kprobe_ftrace_enabled == 1) { + ret = register_ftrace_function(&kprobe_ftrace_ops); + WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret); + } +} + +/* Caller must lock kprobe_mutex */ +static void __kprobes disarm_kprobe_ftrace(struct kprobe *p) +{ + int ret; + + kprobe_ftrace_enabled--; + if (kprobe_ftrace_enabled == 0) { + ret = unregister_ftrace_function(&kprobe_ftrace_ops); + WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret); + } + ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, + (unsigned long)p->addr, 1, 0); + WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); +} +#else /* !KPROBES_CAN_USE_FTRACE */ +#define prepare_kprobe(p) arch_prepare_kprobe(p) +#define arm_kprobe_ftrace(p) do {} while (0) +#define disarm_kprobe_ftrace(p) do {} while (0) +#endif + /* Arm a kprobe with text_mutex */ static void __kprobes arm_kprobe(struct kprobe *kp) { + if (unlikely(kprobe_ftrace(kp))) { + arm_kprobe_ftrace(kp); + return; + } /* * Here, since __arm_kprobe() doesn't use stop_machine(), * this doesn't cause deadlock on text_mutex. So, we don't @@ -921,11 +988,15 @@ static void __kprobes arm_kprobe(struct kprobe *kp) } /* Disarm a kprobe with text_mutex */ -static void __kprobes disarm_kprobe(struct kprobe *kp) +static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt) { + if (unlikely(kprobe_ftrace(kp))) { + disarm_kprobe_ftrace(kp); + return; + } /* Ditto */ mutex_lock(&text_mutex); - __disarm_kprobe(kp, true); + __disarm_kprobe(kp, reopt); mutex_unlock(&text_mutex); } @@ -1144,12 +1215,6 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) if (p->post_handler && !ap->post_handler) ap->post_handler = aggr_post_handler; - if (kprobe_disabled(ap) && !kprobe_disabled(p)) { - ap->flags &= ~KPROBE_FLAG_DISABLED; - if (!kprobes_all_disarmed) - /* Arm the breakpoint again. */ - __arm_kprobe(ap); - } return 0; } @@ -1189,11 +1254,22 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, int ret = 0; struct kprobe *ap = orig_p; + /* For preparing optimization, jump_label_text_reserved() is called */ + jump_label_lock(); + /* + * Get online CPUs to avoid text_mutex deadlock.with stop machine, + * which is invoked by unoptimize_kprobe() in add_new_kprobe() + */ + get_online_cpus(); + mutex_lock(&text_mutex); + if (!kprobe_aggrprobe(orig_p)) { /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */ ap = alloc_aggr_kprobe(orig_p); - if (!ap) - return -ENOMEM; + if (!ap) { + ret = -ENOMEM; + goto out; + } init_aggr_kprobe(ap, orig_p); } else if (kprobe_unused(ap)) /* This probe is going to die. Rescue it */ @@ -1213,7 +1289,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, * free aggr_probe. It will be used next time, or * freed by unregister_kprobe. */ - return ret; + goto out; /* Prepare optimized instructions if possible. */ prepare_optimized_kprobe(ap); @@ -1228,7 +1304,20 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, /* Copy ap's insn slot to p */ copy_kprobe(ap, p); - return add_new_kprobe(ap, p); + ret = add_new_kprobe(ap, p); + +out: + mutex_unlock(&text_mutex); + put_online_cpus(); + jump_label_unlock(); + + if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) { + ap->flags &= ~KPROBE_FLAG_DISABLED; + if (!kprobes_all_disarmed) + /* Arm the breakpoint again. */ + arm_kprobe(ap); + } + return ret; } static int __kprobes in_kprobes_functions(unsigned long addr) @@ -1313,71 +1402,96 @@ static inline int check_kprobe_rereg(struct kprobe *p) return ret; } -int __kprobes register_kprobe(struct kprobe *p) +static __kprobes int check_kprobe_address_safe(struct kprobe *p, + struct module **probed_mod) { int ret = 0; - struct kprobe *old_p; - struct module *probed_mod; - kprobe_opcode_t *addr; - - addr = kprobe_addr(p); - if (IS_ERR(addr)) - return PTR_ERR(addr); - p->addr = addr; + unsigned long ftrace_addr; - ret = check_kprobe_rereg(p); - if (ret) - return ret; + /* + * If the address is located on a ftrace nop, set the + * breakpoint to the following instruction. + */ + ftrace_addr = ftrace_location((unsigned long)p->addr); + if (ftrace_addr) { +#ifdef KPROBES_CAN_USE_FTRACE + /* Given address is not on the instruction boundary */ + if ((unsigned long)p->addr != ftrace_addr) + return -EILSEQ; + p->flags |= KPROBE_FLAG_FTRACE; +#else /* !KPROBES_CAN_USE_FTRACE */ + return -EINVAL; +#endif + } jump_label_lock(); preempt_disable(); + + /* Ensure it is not in reserved area nor out of text */ if (!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr) || - ftrace_text_reserved(p->addr, p->addr) || jump_label_text_reserved(p->addr, p->addr)) { ret = -EINVAL; - goto cannot_probe; + goto out; } - /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ - p->flags &= KPROBE_FLAG_DISABLED; - - /* - * Check if are we probing a module. - */ - probed_mod = __module_text_address((unsigned long) p->addr); - if (probed_mod) { - /* Return -ENOENT if fail. */ - ret = -ENOENT; + /* Check if are we probing a module */ + *probed_mod = __module_text_address((unsigned long) p->addr); + if (*probed_mod) { /* * We must hold a refcount of the probed module while updating * its code to prohibit unexpected unloading. */ - if (unlikely(!try_module_get(probed_mod))) - goto cannot_probe; + if (unlikely(!try_module_get(*probed_mod))) { + ret = -ENOENT; + goto out; + } /* * If the module freed .init.text, we couldn't insert * kprobes in there. */ - if (within_module_init((unsigned long)p->addr, probed_mod) && - probed_mod->state != MODULE_STATE_COMING) { - module_put(probed_mod); - goto cannot_probe; + if (within_module_init((unsigned long)p->addr, *probed_mod) && + (*probed_mod)->state != MODULE_STATE_COMING) { + module_put(*probed_mod); + *probed_mod = NULL; + ret = -ENOENT; } - /* ret will be updated by following code */ } +out: preempt_enable(); jump_label_unlock(); + return ret; +} + +int __kprobes register_kprobe(struct kprobe *p) +{ + int ret; + struct kprobe *old_p; + struct module *probed_mod; + kprobe_opcode_t *addr; + + /* Adjust probe address from symbol */ + addr = kprobe_addr(p); + if (IS_ERR(addr)) + return PTR_ERR(addr); + p->addr = addr; + + ret = check_kprobe_rereg(p); + if (ret) + return ret; + + /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ + p->flags &= KPROBE_FLAG_DISABLED; p->nmissed = 0; INIT_LIST_HEAD(&p->list); - mutex_lock(&kprobe_mutex); - jump_label_lock(); /* needed to call jump_label_text_reserved() */ + ret = check_kprobe_address_safe(p, &probed_mod); + if (ret) + return ret; - get_online_cpus(); /* For avoiding text_mutex deadlock. */ - mutex_lock(&text_mutex); + mutex_lock(&kprobe_mutex); old_p = get_kprobe(p->addr); if (old_p) { @@ -1386,7 +1500,9 @@ int __kprobes register_kprobe(struct kprobe *p) goto out; } - ret = arch_prepare_kprobe(p); + mutex_lock(&text_mutex); /* Avoiding text modification */ + ret = prepare_kprobe(p); + mutex_unlock(&text_mutex); if (ret) goto out; @@ -1395,26 +1511,18 @@ int __kprobes register_kprobe(struct kprobe *p) &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); if (!kprobes_all_disarmed && !kprobe_disabled(p)) - __arm_kprobe(p); + arm_kprobe(p); /* Try to optimize kprobe */ try_to_optimize_kprobe(p); out: - mutex_unlock(&text_mutex); - put_online_cpus(); - jump_label_unlock(); mutex_unlock(&kprobe_mutex); if (probed_mod) module_put(probed_mod); return ret; - -cannot_probe: - preempt_enable(); - jump_label_unlock(); - return ret; } EXPORT_SYMBOL_GPL(register_kprobe); @@ -1451,7 +1559,7 @@ static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p) /* Try to disarm and disable this/parent probe */ if (p == orig_p || aggr_kprobe_disabled(orig_p)) { - disarm_kprobe(orig_p); + disarm_kprobe(orig_p, true); orig_p->flags |= KPROBE_FLAG_DISABLED; } } @@ -2049,10 +2157,11 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, if (!pp) pp = p; - seq_printf(pi, "%s%s%s\n", + seq_printf(pi, "%s%s%s%s\n", (kprobe_gone(p) ? "[GONE]" : ""), ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""), - (kprobe_optimized(pp) ? "[OPTIMIZED]" : "")); + (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""), + (kprobe_ftrace(pp) ? "[FTRACE]" : "")); } static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) @@ -2131,14 +2240,12 @@ static void __kprobes arm_all_kprobes(void) goto already_enabled; /* Arming kprobes doesn't optimize kprobe itself */ - mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) if (!kprobe_disabled(p)) - __arm_kprobe(p); + arm_kprobe(p); } - mutex_unlock(&text_mutex); kprobes_all_disarmed = false; printk(KERN_INFO "Kprobes globally enabled\n"); @@ -2166,15 +2273,13 @@ static void __kprobes disarm_all_kprobes(void) kprobes_all_disarmed = true; printk(KERN_INFO "Kprobes globally disabled\n"); - mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) { if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) - __disarm_kprobe(p, false); + disarm_kprobe(p, false); } } - mutex_unlock(&text_mutex); mutex_unlock(&kprobe_mutex); /* Wait for disarming all kprobes by optimizer */ diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 4e316e1acf58..6ada93c23a9a 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -26,7 +26,6 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) static struct kobj_attribute _name##_attr = \ __ATTR(_name, 0644, _name##_show, _name##_store) -#if defined(CONFIG_HOTPLUG) /* current uevent sequence number */ static ssize_t uevent_seqnum_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -54,7 +53,7 @@ static ssize_t uevent_helper_store(struct kobject *kobj, return count; } KERNEL_ATTR_RW(uevent_helper); -#endif + #ifdef CONFIG_PROFILING static ssize_t profiling_show(struct kobject *kobj, @@ -141,6 +140,23 @@ static ssize_t fscaps_show(struct kobject *kobj, } KERNEL_ATTR_RO(fscaps); +int rcu_expedited; +static ssize_t rcu_expedited_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", rcu_expedited); +} +static ssize_t rcu_expedited_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (kstrtoint(buf, 0, &rcu_expedited)) + return -EINVAL; + + return count; +} +KERNEL_ATTR_RW(rcu_expedited); + /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. */ @@ -169,10 +185,8 @@ EXPORT_SYMBOL_GPL(kernel_kobj); static struct attribute * kernel_attrs[] = { &fscaps_attr.attr, -#if defined(CONFIG_HOTPLUG) &uevent_seqnum_attr.attr, &uevent_helper_attr.attr, -#endif #ifdef CONFIG_PROFILING &profiling_attr.attr, #endif @@ -182,6 +196,7 @@ static struct attribute * kernel_attrs[] = { &kexec_crash_size_attr.attr, &vmcoreinfo_attr.attr, #endif + &rcu_expedited_attr.attr, NULL }; diff --git a/kernel/kthread.c b/kernel/kthread.c index b579af57ea10..691dc2ef9baf 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -16,6 +16,7 @@ #include <linux/mutex.h> #include <linux/slab.h> #include <linux/freezer.h> +#include <linux/ptrace.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); @@ -37,11 +38,20 @@ struct kthread_create_info }; struct kthread { - int should_stop; + unsigned long flags; + unsigned int cpu; void *data; + struct completion parked; struct completion exited; }; +enum KTHREAD_BITS { + KTHREAD_IS_PER_CPU = 0, + KTHREAD_SHOULD_STOP, + KTHREAD_SHOULD_PARK, + KTHREAD_IS_PARKED, +}; + #define to_kthread(tsk) \ container_of((tsk)->vfork_done, struct kthread, exited) @@ -52,13 +62,29 @@ struct kthread { * and this will return true. You should then return, and your return * value will be passed through to kthread_stop(). */ -int kthread_should_stop(void) +bool kthread_should_stop(void) { - return to_kthread(current)->should_stop; + return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags); } EXPORT_SYMBOL(kthread_should_stop); /** + * kthread_should_park - should this kthread park now? + * + * When someone calls kthread_park() on your kthread, it will be woken + * and this will return true. You should then do the necessary + * cleanup and call kthread_parkme() + * + * Similar to kthread_should_stop(), but this keeps the thread alive + * and in a park position. kthread_unpark() "restarts" the thread and + * calls the thread function again. + */ +bool kthread_should_park(void) +{ + return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags); +} + +/** * kthread_freezable_should_stop - should this freezable kthread return now? * @was_frozen: optional out parameter, indicates whether %current was frozen * @@ -96,6 +122,24 @@ void *kthread_data(struct task_struct *task) return to_kthread(task)->data; } +static void __kthread_parkme(struct kthread *self) +{ + __set_current_state(TASK_INTERRUPTIBLE); + while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) { + if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags)) + complete(&self->parked); + schedule(); + __set_current_state(TASK_INTERRUPTIBLE); + } + clear_bit(KTHREAD_IS_PARKED, &self->flags); + __set_current_state(TASK_RUNNING); +} + +void kthread_parkme(void) +{ + __kthread_parkme(to_kthread(current)); +} + static int kthread(void *_create) { /* Copy data: it's on kthread's stack */ @@ -105,9 +149,10 @@ static int kthread(void *_create) struct kthread self; int ret; - self.should_stop = 0; + self.flags = 0; self.data = data; init_completion(&self.exited); + init_completion(&self.parked); current->vfork_done = &self.exited; /* OK, tell user we're spawned, wait for stop or wakeup */ @@ -117,9 +162,11 @@ static int kthread(void *_create) schedule(); ret = -EINTR; - if (!self.should_stop) - ret = threadfn(data); + if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) { + __kthread_parkme(&self); + ret = threadfn(data); + } /* we can't just return, we must preserve "self" on stack */ do_exit(ret); } @@ -172,8 +219,7 @@ static void create_kthread(struct kthread_create_info *create) * Returns a task_struct or ERR_PTR(-ENOMEM). */ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), - void *data, - int node, + void *data, int node, const char namefmt[], ...) { @@ -210,6 +256,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), } EXPORT_SYMBOL(kthread_create_on_node); +static void __kthread_bind(struct task_struct *p, unsigned int cpu) +{ + /* It's safe because the task is inactive. */ + do_set_cpus_allowed(p, cpumask_of(cpu)); + p->flags |= PF_THREAD_BOUND; +} + /** * kthread_bind - bind a just-created kthread to a cpu. * @p: thread created by kthread_create(). @@ -226,14 +279,112 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) WARN_ON(1); return; } - - /* It's safe because the task is inactive. */ - do_set_cpus_allowed(p, cpumask_of(cpu)); - p->flags |= PF_THREAD_BOUND; + __kthread_bind(p, cpu); } EXPORT_SYMBOL(kthread_bind); /** + * kthread_create_on_cpu - Create a cpu bound kthread + * @threadfn: the function to run until signal_pending(current). + * @data: data ptr for @threadfn. + * @cpu: The cpu on which the thread should be bound, + * @namefmt: printf-style name for the thread. Format is restricted + * to "name.*%u". Code fills in cpu number. + * + * Description: This helper function creates and names a kernel thread + * The thread will be woken and put into park mode. + */ +struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), + void *data, unsigned int cpu, + const char *namefmt) +{ + struct task_struct *p; + + p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, + cpu); + if (IS_ERR(p)) + return p; + set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); + to_kthread(p)->cpu = cpu; + /* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */ + kthread_park(p); + return p; +} + +static struct kthread *task_get_live_kthread(struct task_struct *k) +{ + struct kthread *kthread; + + get_task_struct(k); + kthread = to_kthread(k); + /* It might have exited */ + barrier(); + if (k->vfork_done != NULL) + return kthread; + return NULL; +} + +/** + * kthread_unpark - unpark a thread created by kthread_create(). + * @k: thread created by kthread_create(). + * + * Sets kthread_should_park() for @k to return false, wakes it, and + * waits for it to return. If the thread is marked percpu then its + * bound to the cpu again. + */ +void kthread_unpark(struct task_struct *k) +{ + struct kthread *kthread = task_get_live_kthread(k); + + if (kthread) { + clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + /* + * We clear the IS_PARKED bit here as we don't wait + * until the task has left the park code. So if we'd + * park before that happens we'd see the IS_PARKED bit + * which might be about to be cleared. + */ + if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { + if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) + __kthread_bind(k, kthread->cpu); + wake_up_process(k); + } + } + put_task_struct(k); +} + +/** + * kthread_park - park a thread created by kthread_create(). + * @k: thread created by kthread_create(). + * + * Sets kthread_should_park() for @k to return true, wakes it, and + * waits for it to return. This can also be called after kthread_create() + * instead of calling wake_up_process(): the thread will park without + * calling threadfn(). + * + * Returns 0 if the thread is parked, -ENOSYS if the thread exited. + * If called by the kthread itself just the park bit is set. + */ +int kthread_park(struct task_struct *k) +{ + struct kthread *kthread = task_get_live_kthread(k); + int ret = -ENOSYS; + + if (kthread) { + if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) { + set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + if (k != current) { + wake_up_process(k); + wait_for_completion(&kthread->parked); + } + } + ret = 0; + } + put_task_struct(k); + return ret; +} + +/** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). * @@ -250,16 +401,13 @@ EXPORT_SYMBOL(kthread_bind); */ int kthread_stop(struct task_struct *k) { - struct kthread *kthread; + struct kthread *kthread = task_get_live_kthread(k); int ret; trace_sched_kthread_stop(k); - get_task_struct(k); - - kthread = to_kthread(k); - barrier(); /* it might have exited */ - if (k->vfork_done != NULL) { - kthread->should_stop = 1; + if (kthread) { + set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); + clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); wake_up_process(k); wait_for_completion(&kthread->exited); } @@ -280,7 +428,7 @@ int kthreadd(void *unused) set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); set_cpus_allowed_ptr(tsk, cpu_all_mask); - set_mems_allowed(node_states[N_HIGH_MEMORY]); + set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; diff --git a/kernel/lockdep.c b/kernel/lockdep.c index ea9ee4518c35..7981e5b2350d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2998,6 +2998,42 @@ EXPORT_SYMBOL_GPL(lockdep_init_map); struct lock_class_key __lockdep_no_validate__; +static int +print_lock_nested_lock_not_held(struct task_struct *curr, + struct held_lock *hlock, + unsigned long ip) +{ + if (!debug_locks_off()) + return 0; + if (debug_locks_silent) + return 0; + + printk("\n"); + printk("==================================\n"); + printk("[ BUG: Nested lock was not taken ]\n"); + print_kernel_ident(); + printk("----------------------------------\n"); + + printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); + print_lock(hlock); + + printk("\nbut this task is not holding:\n"); + printk("%s\n", hlock->nest_lock->name); + + printk("\nstack backtrace:\n"); + dump_stack(); + + printk("\nother info that might help us debug this:\n"); + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +static int __lock_is_held(struct lockdep_map *lock); + /* * This gets called for every mutex_lock*()/spin_lock*() operation. * We maintain the dependency maps and validate the locking attempt: @@ -3139,6 +3175,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, } chain_key = iterate_chain_key(chain_key, id); + if (nest_lock && !__lock_is_held(nest_lock)) + return print_lock_nested_lock_not_held(curr, hlock, ip); + if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) return 0; diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 91c32a0b612c..b2c71c5873e4 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -39,7 +39,7 @@ static void l_stop(struct seq_file *m, void *v) static void print_name(struct seq_file *m, struct lock_class *class) { - char str[128]; + char str[KSYM_NAME_LEN]; const char *name = class->name; if (!name) { diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S new file mode 100644 index 000000000000..246b4c6e6135 --- /dev/null +++ b/kernel/modsign_certificate.S @@ -0,0 +1,19 @@ +/* SYMBOL_PREFIX defined on commandline from CONFIG_SYMBOL_PREFIX */ +#ifndef SYMBOL_PREFIX +#define ASM_SYMBOL(sym) sym +#else +#define PASTE2(x,y) x##y +#define PASTE(x,y) PASTE2(x,y) +#define ASM_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym) +#endif + +#define GLOBAL(name) \ + .globl ASM_SYMBOL(name); \ + ASM_SYMBOL(name): + + .section ".init.data","aw" + +GLOBAL(modsign_certificate_list) + .incbin "signing_key.x509" + .incbin "extra_certificates" +GLOBAL(modsign_certificate_list_end) diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c new file mode 100644 index 000000000000..2b6e69909c39 --- /dev/null +++ b/kernel/modsign_pubkey.c @@ -0,0 +1,104 @@ +/* Public keys for module signature verification + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/cred.h> +#include <linux/err.h> +#include <keys/asymmetric-type.h> +#include "module-internal.h" + +struct key *modsign_keyring; + +extern __initdata const u8 modsign_certificate_list[]; +extern __initdata const u8 modsign_certificate_list_end[]; + +/* + * We need to make sure ccache doesn't cache the .o file as it doesn't notice + * if modsign.pub changes. + */ +static __initdata const char annoy_ccache[] = __TIME__ "foo"; + +/* + * Load the compiled-in keys + */ +static __init int module_verify_init(void) +{ + pr_notice("Initialise module verification\n"); + + modsign_keyring = keyring_alloc(".module_sign", + KUIDT_INIT(0), KGIDT_INIT(0), + current_cred(), + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ), + KEY_ALLOC_NOT_IN_QUOTA, NULL); + if (IS_ERR(modsign_keyring)) + panic("Can't allocate module signing keyring\n"); + + return 0; +} + +/* + * Must be initialised before we try and load the keys into the keyring. + */ +device_initcall(module_verify_init); + +/* + * Load the compiled-in keys + */ +static __init int load_module_signing_keys(void) +{ + key_ref_t key; + const u8 *p, *end; + size_t plen; + + pr_notice("Loading module verification certificates\n"); + + end = modsign_certificate_list_end; + p = modsign_certificate_list; + while (p < end) { + /* Each cert begins with an ASN.1 SEQUENCE tag and must be more + * than 256 bytes in size. + */ + if (end - p < 4) + goto dodgy_cert; + if (p[0] != 0x30 && + p[1] != 0x82) + goto dodgy_cert; + plen = (p[2] << 8) | p[3]; + plen += 4; + if (plen > end - p) + goto dodgy_cert; + + key = key_create_or_update(make_key_ref(modsign_keyring, 1), + "asymmetric", + NULL, + p, + plen, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA); + if (IS_ERR(key)) + pr_err("MODSIGN: Problem loading in-kernel X.509 certificate (%ld)\n", + PTR_ERR(key)); + else + pr_notice("MODSIGN: Loaded cert '%s'\n", + key_ref_to_ptr(key)->description); + p += plen; + } + + return 0; + +dodgy_cert: + pr_err("MODSIGN: Problem parsing in-kernel X.509 certificate list\n"); + return 0; +} +late_initcall(load_module_signing_keys); diff --git a/kernel/module-internal.h b/kernel/module-internal.h new file mode 100644 index 000000000000..24f9247b7d02 --- /dev/null +++ b/kernel/module-internal.h @@ -0,0 +1,14 @@ +/* Module internals + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +extern struct key *modsign_keyring; + +extern int mod_verify_sig(const void *mod, unsigned long *_modlen); diff --git a/kernel/module.c b/kernel/module.c index 4edbd9c11aca..250092c1d57d 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -21,6 +21,7 @@ #include <linux/ftrace_event.h> #include <linux/init.h> #include <linux/kallsyms.h> +#include <linux/file.h> #include <linux/fs.h> #include <linux/sysfs.h> #include <linux/kernel.h> @@ -28,6 +29,7 @@ #include <linux/vmalloc.h> #include <linux/elf.h> #include <linux/proc_fs.h> +#include <linux/security.h> #include <linux/seq_file.h> #include <linux/syscalls.h> #include <linux/fcntl.h> @@ -58,6 +60,9 @@ #include <linux/jump_label.h> #include <linux/pfn.h> #include <linux/bsearch.h> +#include <linux/fips.h> +#include <uapi/linux/module.h> +#include "module-internal.h" #define CREATE_TRACE_POINTS #include <trace/events/module.h> @@ -102,6 +107,43 @@ static LIST_HEAD(modules); struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ #endif /* CONFIG_KGDB_KDB */ +#ifdef CONFIG_MODULE_SIG +#ifdef CONFIG_MODULE_SIG_FORCE +static bool sig_enforce = true; +#else +static bool sig_enforce = false; + +static int param_set_bool_enable_only(const char *val, + const struct kernel_param *kp) +{ + int err; + bool test; + struct kernel_param dummy_kp = *kp; + + dummy_kp.arg = &test; + + err = param_set_bool(val, &dummy_kp); + if (err) + return err; + + /* Don't let them unset it once it's set! */ + if (!test && sig_enforce) + return -EROFS; + + if (test) + sig_enforce = true; + return 0; +} + +static const struct kernel_param_ops param_ops_bool_enable_only = { + .set = param_set_bool_enable_only, + .get = param_get_bool, +}; +#define param_check_bool_enable_only param_check_bool + +module_param(sig_enforce, bool_enable_only, 0644); +#endif /* !CONFIG_MODULE_SIG_FORCE */ +#endif /* CONFIG_MODULE_SIG */ /* Block module loading/unloading? */ int modules_disabled = 0; @@ -136,6 +178,7 @@ struct load_info { unsigned long symoffs, stroffs; struct _ddebug *debug; unsigned int num_debug; + bool sig_ok; struct { unsigned int sym, str, mod, vers, info, pcpu; } index; @@ -332,9 +375,6 @@ static bool check_symbol(const struct symsearch *syms, printk(KERN_WARNING "Symbol %s is being used " "by a non-GPL module, which will not " "be allowed in the future\n", fsa->name); - printk(KERN_WARNING "Please see the file " - "Documentation/feature-removal-schedule.txt " - "in the kernel source tree for more details.\n"); } } @@ -1949,26 +1989,6 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) return ret; } -int __weak apply_relocate(Elf_Shdr *sechdrs, - const char *strtab, - unsigned int symindex, - unsigned int relsec, - struct module *me) -{ - pr_err("module %s: REL relocation unsupported\n", me->name); - return -ENOEXEC; -} - -int __weak apply_relocate_add(Elf_Shdr *sechdrs, - const char *strtab, - unsigned int symindex, - unsigned int relsec, - struct module *me) -{ - pr_err("module %s: RELA relocation unsupported\n", me->name); - return -ENOEXEC; -} - static int apply_relocations(struct module *mod, const struct load_info *info) { unsigned int i; @@ -2262,7 +2282,7 @@ static void layout_symtab(struct module *mod, struct load_info *info) Elf_Shdr *symsect = info->sechdrs + info->index.sym; Elf_Shdr *strsect = info->sechdrs + info->index.str; const Elf_Sym *src; - unsigned int i, nsrc, ndst, strtab_size; + unsigned int i, nsrc, ndst, strtab_size = 0; /* Put symbol section at end of init part of module. */ symsect->sh_flags |= SHF_ALLOC; @@ -2274,11 +2294,13 @@ static void layout_symtab(struct module *mod, struct load_info *info) nsrc = symsect->sh_size / sizeof(*src); /* Compute total space required for the core symbols' strtab. */ - for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src) - if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { - strtab_size += strlen(&info->strtab[src->st_name]) + 1; + for (ndst = i = 0; i < nsrc; i++) { + if (i == 0 || + is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { + strtab_size += strlen(&info->strtab[src[i].st_name])+1; ndst++; } + } /* Append room for core symbols at end of core part. */ info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); @@ -2312,15 +2334,14 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) mod->core_symtab = dst = mod->module_core + info->symoffs; mod->core_strtab = s = mod->module_core + info->stroffs; src = mod->symtab; - *dst = *src; - *s++ = 0; - for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { - if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) - continue; - - dst[ndst] = *src; - dst[ndst++].st_name = s - mod->core_strtab; - s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1; + for (ndst = i = 0; i < mod->num_symtab; i++) { + if (i == 0 || + is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { + dst[ndst] = src[i]; + dst[ndst++].st_name = s - mod->core_strtab; + s += strlcpy(s, &mod->strtab[src[i].st_name], + KSYM_NAME_LEN) + 1; + } } mod->core_num_syms = ndst; } @@ -2353,7 +2374,7 @@ static void dynamic_debug_remove(struct _ddebug *debug) void * __weak module_alloc(unsigned long size) { - return size == 0 ? NULL : vmalloc_exec(size); + return vmalloc_exec(size); } static void *module_alloc_update_bounds(unsigned long size) @@ -2399,48 +2420,136 @@ static inline void kmemleak_load_module(const struct module *mod, } #endif +#ifdef CONFIG_MODULE_SIG +static int module_sig_check(struct load_info *info) +{ + int err = -ENOKEY; + const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; + const void *mod = info->hdr; + + if (info->len > markerlen && + memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { + /* We truncate the module to discard the signature */ + info->len -= markerlen; + err = mod_verify_sig(mod, &info->len); + } + + if (!err) { + info->sig_ok = true; + return 0; + } + + /* Not having a signature is only an error if we're strict. */ + if (err < 0 && fips_enabled) + panic("Module verification failed with error %d in FIPS mode\n", + err); + if (err == -ENOKEY && !sig_enforce) + err = 0; + + return err; +} +#else /* !CONFIG_MODULE_SIG */ +static int module_sig_check(struct load_info *info) +{ + return 0; +} +#endif /* !CONFIG_MODULE_SIG */ + +/* Sanity checks against invalid binaries, wrong arch, weird elf version. */ +static int elf_header_check(struct load_info *info) +{ + if (info->len < sizeof(*(info->hdr))) + return -ENOEXEC; + + if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0 + || info->hdr->e_type != ET_REL + || !elf_check_arch(info->hdr) + || info->hdr->e_shentsize != sizeof(Elf_Shdr)) + return -ENOEXEC; + + if (info->hdr->e_shoff >= info->len + || (info->hdr->e_shnum * sizeof(Elf_Shdr) > + info->len - info->hdr->e_shoff)) + return -ENOEXEC; + + return 0; +} + /* Sets info->hdr and info->len. */ -static int copy_and_check(struct load_info *info, - const void __user *umod, unsigned long len, - const char __user *uargs) +static int copy_module_from_user(const void __user *umod, unsigned long len, + struct load_info *info) { int err; - Elf_Ehdr *hdr; - if (len < sizeof(*hdr)) + info->len = len; + if (info->len < sizeof(*(info->hdr))) return -ENOEXEC; + err = security_kernel_module_from_file(NULL); + if (err) + return err; + /* Suck in entire file: we'll want most of it. */ - if ((hdr = vmalloc(len)) == NULL) + info->hdr = vmalloc(info->len); + if (!info->hdr) return -ENOMEM; - if (copy_from_user(hdr, umod, len) != 0) { - err = -EFAULT; - goto free_hdr; + if (copy_from_user(info->hdr, umod, info->len) != 0) { + vfree(info->hdr); + return -EFAULT; } - /* Sanity checks against insmoding binaries or wrong arch, - weird elf version */ - if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 - || hdr->e_type != ET_REL - || !elf_check_arch(hdr) - || hdr->e_shentsize != sizeof(Elf_Shdr)) { - err = -ENOEXEC; - goto free_hdr; - } + return 0; +} + +/* Sets info->hdr and info->len. */ +static int copy_module_from_fd(int fd, struct load_info *info) +{ + struct file *file; + int err; + struct kstat stat; + loff_t pos; + ssize_t bytes = 0; + + file = fget(fd); + if (!file) + return -ENOEXEC; - if (hdr->e_shoff >= len || - hdr->e_shnum * sizeof(Elf_Shdr) > len - hdr->e_shoff) { - err = -ENOEXEC; - goto free_hdr; + err = security_kernel_module_from_file(file); + if (err) + goto out; + + err = vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat); + if (err) + goto out; + + if (stat.size > INT_MAX) { + err = -EFBIG; + goto out; + } + info->hdr = vmalloc(stat.size); + if (!info->hdr) { + err = -ENOMEM; + goto out; } - info->hdr = hdr; - info->len = len; - return 0; + pos = 0; + while (pos < stat.size) { + bytes = kernel_read(file, pos, (char *)(info->hdr) + pos, + stat.size - pos); + if (bytes < 0) { + vfree(info->hdr); + err = bytes; + goto out; + } + if (bytes == 0) + break; + pos += bytes; + } + info->len = pos; -free_hdr: - vfree(hdr); +out: + fput(file); return err; } @@ -2449,7 +2558,7 @@ static void free_copy(struct load_info *info) vfree(info->hdr); } -static int rewrite_section_headers(struct load_info *info) +static int rewrite_section_headers(struct load_info *info, int flags) { unsigned int i; @@ -2477,7 +2586,10 @@ static int rewrite_section_headers(struct load_info *info) } /* Track but don't keep modinfo and version sections. */ - info->index.vers = find_sec(info, "__versions"); + if (flags & MODULE_INIT_IGNORE_MODVERSIONS) + info->index.vers = 0; /* Pretend no __versions section! */ + else + info->index.vers = find_sec(info, "__versions"); info->index.info = find_sec(info, ".modinfo"); info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; @@ -2492,7 +2604,7 @@ static int rewrite_section_headers(struct load_info *info) * Return the temporary module pointer (we'll replace it with the final * one when we move the module sections around). */ -static struct module *setup_load_info(struct load_info *info) +static struct module *setup_load_info(struct load_info *info, int flags) { unsigned int i; int err; @@ -2503,7 +2615,7 @@ static struct module *setup_load_info(struct load_info *info) info->secstrings = (void *)info->hdr + info->sechdrs[info->hdr->e_shstrndx].sh_offset; - err = rewrite_section_headers(info); + err = rewrite_section_headers(info, flags); if (err) return ERR_PTR(err); @@ -2541,11 +2653,14 @@ static struct module *setup_load_info(struct load_info *info) return mod; } -static int check_modinfo(struct module *mod, struct load_info *info) +static int check_modinfo(struct module *mod, struct load_info *info, int flags) { const char *modmagic = get_modinfo(info, "vermagic"); int err; + if (flags & MODULE_INIT_IGNORE_VERMAGIC) + modmagic = NULL; + /* This is allowed: modprobe --force will invalidate it. */ if (!modmagic) { err = try_to_force_load(mod, "bad vermagic"); @@ -2675,20 +2790,23 @@ static int move_module(struct module *mod, struct load_info *info) memset(ptr, 0, mod->core_size); mod->module_core = ptr; - ptr = module_alloc_update_bounds(mod->init_size); - /* - * The pointer to this block is stored in the module structure - * which is inside the block. This block doesn't need to be - * scanned as it contains data and code that will be freed - * after the module is initialized. - */ - kmemleak_ignore(ptr); - if (!ptr && mod->init_size) { - module_free(mod, mod->module_core); - return -ENOMEM; - } - memset(ptr, 0, mod->init_size); - mod->module_init = ptr; + if (mod->init_size) { + ptr = module_alloc_update_bounds(mod->init_size); + /* + * The pointer to this block is stored in the module structure + * which is inside the block. This block doesn't need to be + * scanned as it contains data and code that will be freed + * after the module is initialized. + */ + kmemleak_ignore(ptr); + if (!ptr) { + module_free(mod, mod->module_core); + return -ENOMEM; + } + memset(ptr, 0, mod->init_size); + mod->module_init = ptr; + } else + mod->module_init = NULL; /* Transfer each section which specifies SHF_ALLOC */ pr_debug("final section addresses:\n"); @@ -2730,6 +2848,10 @@ static int check_module_license_and_versions(struct module *mod) if (strcmp(mod->name, "driverloader") == 0) add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + /* lve claims to be GPL but upstream won't provide source */ + if (strcmp(mod->name, "lve") == 0) + add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) || (mod->num_gpl_syms && !mod->gpl_crcs) @@ -2777,18 +2899,18 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr, return 0; } -static struct module *layout_and_allocate(struct load_info *info) +static struct module *layout_and_allocate(struct load_info *info, int flags) { /* Module within temporary copy. */ struct module *mod; Elf_Shdr *pcpusec; int err; - mod = setup_load_info(info); + mod = setup_load_info(info, flags); if (IS_ERR(mod)) return mod; - err = check_modinfo(mod, info); + err = check_modinfo(mod, info, flags); if (err) return ERR_PTR(err); @@ -2861,31 +2983,142 @@ static int post_relocation(struct module *mod, const struct load_info *info) return module_finalize(info->hdr, info->sechdrs, mod); } +/* Is this module of this name done loading? No locks held. */ +static bool finished_loading(const char *name) +{ + struct module *mod; + bool ret; + + mutex_lock(&module_mutex); + mod = find_module(name); + ret = !mod || mod->state != MODULE_STATE_COMING; + mutex_unlock(&module_mutex); + + return ret; +} + +/* Call module constructors. */ +static void do_mod_ctors(struct module *mod) +{ +#ifdef CONFIG_CONSTRUCTORS + unsigned long i; + + for (i = 0; i < mod->num_ctors; i++) + mod->ctors[i](); +#endif +} + +/* This is where the real work happens */ +static int do_init_module(struct module *mod) +{ + int ret = 0; + + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_COMING, mod); + + /* Set RO and NX regions for core */ + set_section_ro_nx(mod->module_core, + mod->core_text_size, + mod->core_ro_size, + mod->core_size); + + /* Set RO and NX regions for init */ + set_section_ro_nx(mod->module_init, + mod->init_text_size, + mod->init_ro_size, + mod->init_size); + + do_mod_ctors(mod); + /* Start the module */ + if (mod->init != NULL) + ret = do_one_initcall(mod->init); + if (ret < 0) { + /* Init routine failed: abort. Try to protect us from + buggy refcounters. */ + mod->state = MODULE_STATE_GOING; + synchronize_sched(); + module_put(mod); + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_GOING, mod); + free_module(mod); + wake_up_all(&module_wq); + return ret; + } + if (ret > 0) { + printk(KERN_WARNING +"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" +"%s: loading module anyway...\n", + __func__, mod->name, ret, + __func__); + dump_stack(); + } + + /* Now it's a first class citizen! */ + mod->state = MODULE_STATE_LIVE; + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_LIVE, mod); + + /* We need to finish all async code before the module init sequence is done */ + async_synchronize_full(); + + mutex_lock(&module_mutex); + /* Drop initial reference. */ + module_put(mod); + trim_init_extable(mod); +#ifdef CONFIG_KALLSYMS + mod->num_symtab = mod->core_num_syms; + mod->symtab = mod->core_symtab; + mod->strtab = mod->core_strtab; +#endif + unset_module_init_ro_nx(mod); + module_free(mod, mod->module_init); + mod->module_init = NULL; + mod->init_size = 0; + mod->init_ro_size = 0; + mod->init_text_size = 0; + mutex_unlock(&module_mutex); + wake_up_all(&module_wq); + + return 0; +} + +static int may_init_module(void) +{ + if (!capable(CAP_SYS_MODULE) || modules_disabled) + return -EPERM; + + return 0; +} + /* Allocate and load the module: note that size of section 0 is always zero, and we rely on this for optional sections. */ -static struct module *load_module(void __user *umod, - unsigned long len, - const char __user *uargs) +static int load_module(struct load_info *info, const char __user *uargs, + int flags) { - struct load_info info = { NULL, }; - struct module *mod; + struct module *mod, *old; long err; - pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n", - umod, len, uargs); + err = module_sig_check(info); + if (err) + goto free_copy; - /* Copy in the blobs from userspace, check they are vaguely sane. */ - err = copy_and_check(&info, umod, len, uargs); + err = elf_header_check(info); if (err) - return ERR_PTR(err); + goto free_copy; /* Figure out module layout, and allocate all the memory. */ - mod = layout_and_allocate(&info); + mod = layout_and_allocate(info, flags); if (IS_ERR(mod)) { err = PTR_ERR(mod); goto free_copy; } +#ifdef CONFIG_MODULE_SIG + mod->sig_ok = info->sig_ok; + if (!mod->sig_ok) + add_taint_module(mod, TAINT_FORCED_MODULE); +#endif + /* Now module is in final location, initialize linked lists, etc. */ err = module_unload_init(mod); if (err) @@ -2893,25 +3126,25 @@ static struct module *load_module(void __user *umod, /* Now we've got everything in the final locations, we can * find optional sections. */ - find_module_sections(mod, &info); + find_module_sections(mod, info); err = check_module_license_and_versions(mod); if (err) goto free_unload; /* Set up MODINFO_ATTR fields */ - setup_modinfo(mod, &info); + setup_modinfo(mod, info); /* Fix up syms, so that st_value is a pointer to location. */ - err = simplify_symbols(mod, &info); + err = simplify_symbols(mod, info); if (err < 0) goto free_modinfo; - err = apply_relocations(mod, &info); + err = apply_relocations(mod, info); if (err < 0) goto free_modinfo; - err = post_relocation(mod, &info); + err = post_relocation(mod, info); if (err < 0) goto free_modinfo; @@ -2934,21 +3167,31 @@ static struct module *load_module(void __user *umod, * function to insert in a way safe to concurrent readers. * The mutex protects against concurrent writers. */ +again: mutex_lock(&module_mutex); - if (find_module(mod->name)) { + if ((old = find_module(mod->name)) != NULL) { + if (old->state == MODULE_STATE_COMING) { + /* Wait in case it fails to load. */ + mutex_unlock(&module_mutex); + err = wait_event_interruptible(module_wq, + finished_loading(mod->name)); + if (err) + goto free_arch_cleanup; + goto again; + } err = -EEXIST; goto unlock; } /* This has to be done once we're sure module name is unique. */ - dynamic_debug_setup(info.debug, info.num_debug); + dynamic_debug_setup(info->debug, info->num_debug); /* Find duplicate symbols */ err = verify_export_symbols(mod); if (err < 0) goto ddebug; - module_bug_finalize(info.hdr, info.sechdrs, mod); + module_bug_finalize(info->hdr, info->sechdrs, mod); list_add_rcu(&mod->list, &modules); mutex_unlock(&module_mutex); @@ -2959,25 +3202,26 @@ static struct module *load_module(void __user *umod, goto unlink; /* Link in to syfs. */ - err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp); + err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); if (err < 0) goto unlink; /* Get rid of temporary copy. */ - free_copy(&info); + free_copy(info); /* Done! */ trace_module_load(mod); - return mod; + + return do_init_module(mod); unlink: mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); module_bug_cleanup(mod); - + wake_up_all(&module_wq); ddebug: - dynamic_debug_remove(info.debug); + dynamic_debug_remove(info->debug); unlock: mutex_unlock(&module_mutex); synchronize_sched(); @@ -2989,106 +3233,52 @@ static struct module *load_module(void __user *umod, free_unload: module_unload_free(mod); free_module: - module_deallocate(mod, &info); + module_deallocate(mod, info); free_copy: - free_copy(&info); - return ERR_PTR(err); -} - -/* Call module constructors. */ -static void do_mod_ctors(struct module *mod) -{ -#ifdef CONFIG_CONSTRUCTORS - unsigned long i; - - for (i = 0; i < mod->num_ctors; i++) - mod->ctors[i](); -#endif + free_copy(info); + return err; } -/* This is where the real work happens */ SYSCALL_DEFINE3(init_module, void __user *, umod, unsigned long, len, const char __user *, uargs) { - struct module *mod; - int ret = 0; + int err; + struct load_info info = { }; - /* Must have permission */ - if (!capable(CAP_SYS_MODULE) || modules_disabled) - return -EPERM; + err = may_init_module(); + if (err) + return err; - /* Do all the hard work */ - mod = load_module(umod, len, uargs); - if (IS_ERR(mod)) - return PTR_ERR(mod); + pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n", + umod, len, uargs); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_COMING, mod); + err = copy_module_from_user(umod, len, &info); + if (err) + return err; - /* Set RO and NX regions for core */ - set_section_ro_nx(mod->module_core, - mod->core_text_size, - mod->core_ro_size, - mod->core_size); + return load_module(&info, uargs, 0); +} - /* Set RO and NX regions for init */ - set_section_ro_nx(mod->module_init, - mod->init_text_size, - mod->init_ro_size, - mod->init_size); +SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) +{ + int err; + struct load_info info = { }; - do_mod_ctors(mod); - /* Start the module */ - if (mod->init != NULL) - ret = do_one_initcall(mod->init); - if (ret < 0) { - /* Init routine failed: abort. Try to protect us from - buggy refcounters. */ - mod->state = MODULE_STATE_GOING; - synchronize_sched(); - module_put(mod); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_GOING, mod); - free_module(mod); - wake_up(&module_wq); - return ret; - } - if (ret > 0) { - printk(KERN_WARNING -"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" -"%s: loading module anyway...\n", - __func__, mod->name, ret, - __func__); - dump_stack(); - } + err = may_init_module(); + if (err) + return err; - /* Now it's a first class citizen! Wake up anyone waiting for it. */ - mod->state = MODULE_STATE_LIVE; - wake_up(&module_wq); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_LIVE, mod); + pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags); - /* We need to finish all async code before the module init sequence is done */ - async_synchronize_full(); + if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS + |MODULE_INIT_IGNORE_VERMAGIC)) + return -EINVAL; - mutex_lock(&module_mutex); - /* Drop initial reference. */ - module_put(mod); - trim_init_extable(mod); -#ifdef CONFIG_KALLSYMS - mod->num_symtab = mod->core_num_syms; - mod->symtab = mod->core_symtab; - mod->strtab = mod->core_strtab; -#endif - unset_module_init_ro_nx(mod); - module_free(mod, mod->module_init); - mod->module_init = NULL; - mod->init_size = 0; - mod->init_ro_size = 0; - mod->init_text_size = 0; - mutex_unlock(&module_mutex); + err = copy_module_from_fd(fd, &info); + if (err) + return err; - return 0; + return load_module(&info, uargs, flags); } static inline int within(unsigned long addr, void *start, unsigned long size) diff --git a/kernel/module_signing.c b/kernel/module_signing.c new file mode 100644 index 000000000000..f2970bddc5ea --- /dev/null +++ b/kernel/module_signing.c @@ -0,0 +1,249 @@ +/* Module signature checker + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/err.h> +#include <crypto/public_key.h> +#include <crypto/hash.h> +#include <keys/asymmetric-type.h> +#include "module-internal.h" + +/* + * Module signature information block. + * + * The constituents of the signature section are, in order: + * + * - Signer's name + * - Key identifier + * - Signature data + * - Information block + */ +struct module_signature { + u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */ + u8 hash; /* Digest algorithm [enum pkey_hash_algo] */ + u8 id_type; /* Key identifier type [enum pkey_id_type] */ + u8 signer_len; /* Length of signer's name */ + u8 key_id_len; /* Length of key identifier */ + u8 __pad[3]; + __be32 sig_len; /* Length of signature data */ +}; + +/* + * Digest the module contents. + */ +static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash, + const void *mod, + unsigned long modlen) +{ + struct public_key_signature *pks; + struct crypto_shash *tfm; + struct shash_desc *desc; + size_t digest_size, desc_size; + int ret; + + pr_devel("==>%s()\n", __func__); + + /* Allocate the hashing algorithm we're going to need and find out how + * big the hash operational data will be. + */ + tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0); + if (IS_ERR(tfm)) + return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm); + + desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); + digest_size = crypto_shash_digestsize(tfm); + + /* We allocate the hash operational data storage on the end of our + * context data and the digest output buffer on the end of that. + */ + ret = -ENOMEM; + pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL); + if (!pks) + goto error_no_pks; + + pks->pkey_hash_algo = hash; + pks->digest = (u8 *)pks + sizeof(*pks) + desc_size; + pks->digest_size = digest_size; + + desc = (void *)pks + sizeof(*pks); + desc->tfm = tfm; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + ret = crypto_shash_init(desc); + if (ret < 0) + goto error; + + ret = crypto_shash_finup(desc, mod, modlen, pks->digest); + if (ret < 0) + goto error; + + crypto_free_shash(tfm); + pr_devel("<==%s() = ok\n", __func__); + return pks; + +error: + kfree(pks); +error_no_pks: + crypto_free_shash(tfm); + pr_devel("<==%s() = %d\n", __func__, ret); + return ERR_PTR(ret); +} + +/* + * Extract an MPI array from the signature data. This represents the actual + * signature. Each raw MPI is prefaced by a BE 2-byte value indicating the + * size of the MPI in bytes. + * + * RSA signatures only have one MPI, so currently we only read one. + */ +static int mod_extract_mpi_array(struct public_key_signature *pks, + const void *data, size_t len) +{ + size_t nbytes; + MPI mpi; + + if (len < 3) + return -EBADMSG; + nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1]; + data += 2; + len -= 2; + if (len != nbytes) + return -EBADMSG; + + mpi = mpi_read_raw_data(data, nbytes); + if (!mpi) + return -ENOMEM; + pks->mpi[0] = mpi; + pks->nr_mpi = 1; + return 0; +} + +/* + * Request an asymmetric key. + */ +static struct key *request_asymmetric_key(const char *signer, size_t signer_len, + const u8 *key_id, size_t key_id_len) +{ + key_ref_t key; + size_t i; + char *id, *q; + + pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len); + + /* Construct an identifier. */ + id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL); + if (!id) + return ERR_PTR(-ENOKEY); + + memcpy(id, signer, signer_len); + + q = id + signer_len; + *q++ = ':'; + *q++ = ' '; + for (i = 0; i < key_id_len; i++) { + *q++ = hex_asc[*key_id >> 4]; + *q++ = hex_asc[*key_id++ & 0x0f]; + } + + *q = 0; + + pr_debug("Look up: \"%s\"\n", id); + + key = keyring_search(make_key_ref(modsign_keyring, 1), + &key_type_asymmetric, id); + if (IS_ERR(key)) + pr_warn("Request for unknown module key '%s' err %ld\n", + id, PTR_ERR(key)); + kfree(id); + + if (IS_ERR(key)) { + switch (PTR_ERR(key)) { + /* Hide some search errors */ + case -EACCES: + case -ENOTDIR: + case -EAGAIN: + return ERR_PTR(-ENOKEY); + default: + return ERR_CAST(key); + } + } + + pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key))); + return key_ref_to_ptr(key); +} + +/* + * Verify the signature on a module. + */ +int mod_verify_sig(const void *mod, unsigned long *_modlen) +{ + struct public_key_signature *pks; + struct module_signature ms; + struct key *key; + const void *sig; + size_t modlen = *_modlen, sig_len; + int ret; + + pr_devel("==>%s(,%zu)\n", __func__, modlen); + + if (modlen <= sizeof(ms)) + return -EBADMSG; + + memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms)); + modlen -= sizeof(ms); + + sig_len = be32_to_cpu(ms.sig_len); + if (sig_len >= modlen) + return -EBADMSG; + modlen -= sig_len; + if ((size_t)ms.signer_len + ms.key_id_len >= modlen) + return -EBADMSG; + modlen -= (size_t)ms.signer_len + ms.key_id_len; + + *_modlen = modlen; + sig = mod + modlen; + + /* For the moment, only support RSA and X.509 identifiers */ + if (ms.algo != PKEY_ALGO_RSA || + ms.id_type != PKEY_ID_X509) + return -ENOPKG; + + if (ms.hash >= PKEY_HASH__LAST || + !pkey_hash_algo[ms.hash]) + return -ENOPKG; + + key = request_asymmetric_key(sig, ms.signer_len, + sig + ms.signer_len, ms.key_id_len); + if (IS_ERR(key)) + return PTR_ERR(key); + + pks = mod_make_digest(ms.hash, mod, modlen); + if (IS_ERR(pks)) { + ret = PTR_ERR(pks); + goto error_put_key; + } + + ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len, + sig_len); + if (ret < 0) + goto error_free_pks; + + ret = verify_signature(key, pks); + pr_devel("verify_signature() = %d\n", ret); + +error_free_pks: + mpi_free(pks->rsa.s); + kfree(pks); +error_put_key: + key_put(key); + pr_devel("<==%s() = %d\n", __func__, ret); + return ret; +} diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index b576f7f14bc6..78e2ecb20165 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -57,7 +57,8 @@ static inline struct nsproxy *create_nsproxy(void) * leave it to the caller to do proper locking and attach it to task. */ static struct nsproxy *create_new_namespaces(unsigned long flags, - struct task_struct *tsk, struct fs_struct *new_fs) + struct task_struct *tsk, struct user_namespace *user_ns, + struct fs_struct *new_fs) { struct nsproxy *new_nsp; int err; @@ -66,31 +67,31 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, if (!new_nsp) return ERR_PTR(-ENOMEM); - new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); + new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs); if (IS_ERR(new_nsp->mnt_ns)) { err = PTR_ERR(new_nsp->mnt_ns); goto out_ns; } - new_nsp->uts_ns = copy_utsname(flags, tsk); + new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns); if (IS_ERR(new_nsp->uts_ns)) { err = PTR_ERR(new_nsp->uts_ns); goto out_uts; } - new_nsp->ipc_ns = copy_ipcs(flags, tsk); + new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns); if (IS_ERR(new_nsp->ipc_ns)) { err = PTR_ERR(new_nsp->ipc_ns); goto out_ipc; } - new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); + new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns); if (IS_ERR(new_nsp->pid_ns)) { err = PTR_ERR(new_nsp->pid_ns); goto out_pid; } - new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns); + new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns); if (IS_ERR(new_nsp->net_ns)) { err = PTR_ERR(new_nsp->net_ns); goto out_net; @@ -122,6 +123,7 @@ out_ns: int copy_namespaces(unsigned long flags, struct task_struct *tsk) { struct nsproxy *old_ns = tsk->nsproxy; + struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); struct nsproxy *new_ns; int err = 0; @@ -134,7 +136,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) CLONE_NEWPID | CLONE_NEWNET))) return 0; - if (!capable(CAP_SYS_ADMIN)) { + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) { err = -EPERM; goto out; } @@ -151,7 +153,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) goto out; } - new_ns = create_new_namespaces(flags, tsk, tsk->fs); + new_ns = create_new_namespaces(flags, tsk, + task_cred_xxx(tsk, user_ns), tsk->fs); if (IS_ERR(new_ns)) { err = PTR_ERR(new_ns); goto out; @@ -183,19 +186,21 @@ void free_nsproxy(struct nsproxy *ns) * On success, returns the new nsproxy. */ int unshare_nsproxy_namespaces(unsigned long unshare_flags, - struct nsproxy **new_nsp, struct fs_struct *new_fs) + struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs) { + struct user_namespace *user_ns; int err = 0; if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWNET))) + CLONE_NEWNET | CLONE_NEWPID))) return 0; - if (!capable(CAP_SYS_ADMIN)) + user_ns = new_cred ? new_cred->user_ns : current_user_ns(); + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; - *new_nsp = create_new_namespaces(unshare_flags, current, - new_fs ? new_fs : current->fs); + *new_nsp = create_new_namespaces(unshare_flags, current, user_ns, + new_fs ? new_fs : current->fs); if (IS_ERR(*new_nsp)) { err = PTR_ERR(*new_nsp); goto out; @@ -241,9 +246,6 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) struct file *file; int err; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - file = proc_ns_fget(fd); if (IS_ERR(file)) return PTR_ERR(file); @@ -254,7 +256,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) if (nstype && (ops->type != nstype)) goto out; - new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); + new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); if (IS_ERR(new_nsproxy)) { err = PTR_ERR(new_nsproxy); goto out; diff --git a/kernel/padata.c b/kernel/padata.c index 89fe3d1b9efb..072f4ee4eb89 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -171,7 +171,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) { int cpu, num_cpus; unsigned int next_nr, next_index; - struct padata_parallel_queue *queue, *next_queue; + struct padata_parallel_queue *next_queue; struct padata_priv *padata; struct padata_list *reorder; @@ -204,8 +204,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) goto out; } - queue = per_cpu_ptr(pd->pqueue, smp_processor_id()); - if (queue->cpu_index == next_queue->cpu_index) { + if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) { padata = ERR_PTR(-ENODATA); goto out; } diff --git a/kernel/pid.c b/kernel/pid.c index e86b291ad834..36aa02ff17d6 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -1,8 +1,8 @@ /* * Generic pidhash and scalable, time-bounded PID allocator * - * (C) 2002-2003 William Irwin, IBM - * (C) 2004 William Irwin, Oracle + * (C) 2002-2003 Nadia Yvette Chambers, IBM + * (C) 2004 Nadia Yvette Chambers, Oracle * (C) 2002-2004 Ingo Molnar, Red Hat * * pid-structures are backing objects for tasks sharing a given ID to chain @@ -36,6 +36,7 @@ #include <linux/pid_namespace.h> #include <linux/init_task.h> #include <linux/syscalls.h> +#include <linux/proc_fs.h> #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) @@ -78,24 +79,11 @@ struct pid_namespace init_pid_ns = { .last_pid = 0, .level = 0, .child_reaper = &init_task, + .user_ns = &init_user_ns, + .proc_inum = PROC_PID_INIT_INO, }; EXPORT_SYMBOL_GPL(init_pid_ns); -int is_container_init(struct task_struct *tsk) -{ - int ret = 0; - struct pid *pid; - - rcu_read_lock(); - pid = task_pid(tsk); - if (pid != NULL && pid->numbers[pid->level].nr == 1) - ret = 1; - rcu_read_unlock(); - - return ret; -} -EXPORT_SYMBOL(is_container_init); - /* * Note: disable interrupts while the pidmap_lock is held as an * interrupt might come in and do read_lock(&tasklist_lock). @@ -269,8 +257,24 @@ void free_pid(struct pid *pid) unsigned long flags; spin_lock_irqsave(&pidmap_lock, flags); - for (i = 0; i <= pid->level; i++) - hlist_del_rcu(&pid->numbers[i].pid_chain); + for (i = 0; i <= pid->level; i++) { + struct upid *upid = pid->numbers + i; + struct pid_namespace *ns = upid->ns; + hlist_del_rcu(&upid->pid_chain); + switch(--ns->nr_hashed) { + case 1: + /* When all that is left in the pid namespace + * is the reaper wake up the reaper. The reaper + * may be sleeping in zap_pid_ns_processes(). + */ + wake_up_process(ns->child_reaper); + break; + case 0: + ns->nr_hashed = -1; + schedule_work(&ns->proc_work); + break; + } + } spin_unlock_irqrestore(&pidmap_lock, flags); for (i = 0; i <= pid->level; i++) @@ -292,6 +296,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) goto out; tmp = ns; + pid->level = ns->level; for (i = ns->level; i >= 0; i--) { nr = alloc_pidmap(tmp); if (nr < 0) @@ -302,22 +307,32 @@ struct pid *alloc_pid(struct pid_namespace *ns) tmp = tmp->parent; } + if (unlikely(is_child_reaper(pid))) { + if (pid_ns_prepare_proc(ns)) + goto out_free; + } + get_pid_ns(ns); - pid->level = ns->level; atomic_set(&pid->count, 1); for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); - for ( ; upid >= pid->numbers; --upid) + if (ns->nr_hashed < 0) + goto out_unlock; + for ( ; upid >= pid->numbers; --upid) { hlist_add_head_rcu(&upid->pid_chain, &pid_hash[pid_hashfn(upid->nr, upid->ns)]); + upid->ns->nr_hashed++; + } spin_unlock_irq(&pidmap_lock); out: return pid; +out_unlock: + spin_unlock(&pidmap_lock); out_free: while (++i <= ns->level) free_pidmap(pid->numbers + i); @@ -344,7 +359,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns); struct pid *find_vpid(int nr) { - return find_pid_ns(nr, current->nsproxy->pid_ns); + return find_pid_ns(nr, task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(find_vpid); @@ -428,7 +443,7 @@ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) struct task_struct *find_task_by_vpid(pid_t vnr) { - return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns); + return find_task_by_pid_ns(vnr, task_active_pid_ns(current)); } struct pid *get_task_pid(struct task_struct *task, enum pid_type type) @@ -479,10 +494,11 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) } return nr; } +EXPORT_SYMBOL_GPL(pid_nr_ns); pid_t pid_vnr(struct pid *pid) { - return pid_nr_ns(pid, current->nsproxy->pid_ns); + return pid_nr_ns(pid, task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(pid_vnr); @@ -493,7 +509,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, rcu_read_lock(); if (!ns) - ns = current->nsproxy->pid_ns; + ns = task_active_pid_ns(current); if (likely(pid_alive(task))) { if (type != PIDTYPE_PID) task = task->group_leader; @@ -568,6 +584,7 @@ void __init pidmap_init(void) /* Reserve PID 0. We never call free_pidmap(0) */ set_bit(0, init_pid_ns.pidmap[0].page); atomic_dec(&init_pid_ns.pidmap[0].nr_free); + init_pid_ns.nr_hashed = 1; init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 6144bab8fd8e..fdbd0cdf271a 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -10,12 +10,14 @@ #include <linux/pid.h> #include <linux/pid_namespace.h> +#include <linux/user_namespace.h> #include <linux/syscalls.h> #include <linux/err.h> #include <linux/acct.h> #include <linux/slab.h> #include <linux/proc_fs.h> #include <linux/reboot.h> +#include <linux/export.h> #define BITS_PER_PAGE (PAGE_SIZE*8) @@ -70,12 +72,29 @@ err_alloc: return NULL; } -static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns) +static void proc_cleanup_work(struct work_struct *work) +{ + struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work); + pid_ns_release_proc(ns); +} + +/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ +#define MAX_PID_NS_LEVEL 32 + +static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, + struct pid_namespace *parent_pid_ns) { struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; - int i, err = -ENOMEM; + int i; + int err; + if (level > MAX_PID_NS_LEVEL) { + err = -EINVAL; + goto out; + } + + err = -ENOMEM; ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); if (ns == NULL) goto out; @@ -88,9 +107,15 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p if (ns->pid_cachep == NULL) goto out_free_map; + err = proc_alloc_inum(&ns->proc_inum); + if (err) + goto out_free_map; + kref_init(&ns->kref); ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); + ns->user_ns = get_user_ns(user_ns); + INIT_WORK(&ns->proc_work, proc_cleanup_work); set_bit(0, ns->pidmap[0].page); atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); @@ -98,14 +123,8 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p for (i = 1; i < PIDMAP_ENTRIES; i++) atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); - err = pid_ns_prepare_proc(ns); - if (err) - goto out_put_parent_pid_ns; - return ns; -out_put_parent_pid_ns: - put_pid_ns(parent_pid_ns); out_free_map: kfree(ns->pidmap[0].page); out_free: @@ -118,32 +137,43 @@ static void destroy_pid_namespace(struct pid_namespace *ns) { int i; + proc_free_inum(ns->proc_inum); for (i = 0; i < PIDMAP_ENTRIES; i++) kfree(ns->pidmap[i].page); + put_user_ns(ns->user_ns); kmem_cache_free(pid_ns_cachep, ns); } -struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) +struct pid_namespace *copy_pid_ns(unsigned long flags, + struct user_namespace *user_ns, struct pid_namespace *old_ns) { if (!(flags & CLONE_NEWPID)) return get_pid_ns(old_ns); - if (flags & (CLONE_THREAD|CLONE_PARENT)) + if (task_active_pid_ns(current) != old_ns) return ERR_PTR(-EINVAL); - return create_pid_namespace(old_ns); + return create_pid_namespace(user_ns, old_ns); } -void free_pid_ns(struct kref *kref) +static void free_pid_ns(struct kref *kref) { - struct pid_namespace *ns, *parent; + struct pid_namespace *ns; ns = container_of(kref, struct pid_namespace, kref); - - parent = ns->parent; destroy_pid_namespace(ns); +} - if (parent != NULL) - put_pid_ns(parent); +void put_pid_ns(struct pid_namespace *ns) +{ + struct pid_namespace *parent; + + while (ns != &init_pid_ns) { + parent = ns->parent; + if (!kref_put(&ns->kref, free_pid_ns)) + break; + ns = parent; + } } +EXPORT_SYMBOL_GPL(put_pid_ns); void zap_pid_ns_processes(struct pid_namespace *pid_ns) { @@ -192,22 +222,15 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) /* * sys_wait4() above can't reap the TASK_DEAD children. - * Make sure they all go away, see __unhash_process(). + * Make sure they all go away, see free_pid(). */ for (;;) { - bool need_wait = false; - - read_lock(&tasklist_lock); - if (!list_empty(¤t->children)) { - __set_current_state(TASK_UNINTERRUPTIBLE); - need_wait = true; - } - read_unlock(&tasklist_lock); - - if (!need_wait) + set_current_state(TASK_UNINTERRUPTIBLE); + if (pid_ns->nr_hashed == 1) break; schedule(); } + __set_current_state(TASK_RUNNING); if (pid_ns->reboot) current->signal->group_exit_code = pid_ns->reboot; @@ -220,9 +243,10 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) static int pid_ns_ctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct pid_namespace *pid_ns = task_active_pid_ns(current); struct ctl_table tmp = *table; - if (write && !capable(CAP_SYS_ADMIN)) + if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; /* @@ -231,7 +255,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, * it should synchronize its usage with external means. */ - tmp.data = ¤t->nsproxy->pid_ns->last_pid; + tmp.data = &pid_ns->last_pid; return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); } @@ -280,6 +304,68 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) return 0; } +static void *pidns_get(struct task_struct *task) +{ + struct pid_namespace *ns; + + rcu_read_lock(); + ns = get_pid_ns(task_active_pid_ns(task)); + rcu_read_unlock(); + + return ns; +} + +static void pidns_put(void *ns) +{ + put_pid_ns(ns); +} + +static int pidns_install(struct nsproxy *nsproxy, void *ns) +{ + struct pid_namespace *active = task_active_pid_ns(current); + struct pid_namespace *ancestor, *new = ns; + + if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || + !nsown_capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* + * Only allow entering the current active pid namespace + * or a child of the current active pid namespace. + * + * This is required for fork to return a usable pid value and + * this maintains the property that processes and their + * children can not escape their current pid namespace. + */ + if (new->level < active->level) + return -EINVAL; + + ancestor = new; + while (ancestor->level > active->level) + ancestor = ancestor->parent; + if (ancestor != active) + return -EINVAL; + + put_pid_ns(nsproxy->pid_ns); + nsproxy->pid_ns = get_pid_ns(new); + return 0; +} + +static unsigned int pidns_inum(void *ns) +{ + struct pid_namespace *pid_ns = ns; + return pid_ns->proc_inum; +} + +const struct proc_ns_operations pidns_operations = { + .name = "pid", + .type = CLONE_NEWPID, + .get = pidns_get, + .put = pidns_put, + .install = pidns_install, + .inum = pidns_inum, +}; + static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 125cb67daa21..a278cad1d5d6 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -9,6 +9,7 @@ #include <asm/uaccess.h> #include <linux/kernel_stat.h> #include <trace/events/timer.h> +#include <linux/random.h> /* * Called after updating RLIMIT_CPU to run cpu timer and update @@ -217,30 +218,6 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, return 0; } -void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) -{ - struct signal_struct *sig = tsk->signal; - struct task_struct *t; - - times->utime = sig->utime; - times->stime = sig->stime; - times->sum_exec_runtime = sig->sum_sched_runtime; - - rcu_read_lock(); - /* make sure we can trust tsk->thread_group list */ - if (!likely(pid_alive(tsk))) - goto out; - - t = tsk; - do { - times->utime += t->utime; - times->stime += t->stime; - times->sum_exec_runtime += task_sched_runtime(t); - } while_each_thread(tsk, t); -out: - rcu_read_unlock(); -} - static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) { if (b->utime > a->utime) @@ -494,6 +471,8 @@ static void cleanup_timers(struct list_head *head, */ void posix_cpu_timers_exit(struct task_struct *tsk) { + add_device_randomness((const void*) &tsk->se.sum_exec_runtime, + sizeof(unsigned long long)); cleanup_timers(tsk->cpu_timers, tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index a70518c9d82f..5dfdc9ea180b 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -263,6 +263,10 @@ config PM_GENERIC_DOMAINS bool depends on PM +config PM_GENERIC_DOMAINS_SLEEP + def_bool y + depends on PM_SLEEP && PM_GENERIC_DOMAINS + config PM_GENERIC_DOMAINS_RUNTIME def_bool y depends on PM_RUNTIME && PM_GENERIC_DOMAINS diff --git a/kernel/power/main.c b/kernel/power/main.c index f458238109cc..1c16f9167de1 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -59,7 +59,7 @@ static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, { unsigned long val; - if (strict_strtoul(buf, 10, &val)) + if (kstrtoul(buf, 10, &val)) return -EINVAL; if (val > 1) diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index d52359374e85..68197a4e8fc9 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -37,7 +37,7 @@ static struct sysrq_key_op sysrq_poweroff_op = { .enable_mask = SYSRQ_ENABLE_BOOT, }; -static int pm_sysrq_init(void) +static int __init pm_sysrq_init(void) { register_sysrq_key('o', &sysrq_poweroff_op); return 0; diff --git a/kernel/power/process.c b/kernel/power/process.c index 19db29f67558..d5a258b60c6f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -48,18 +48,7 @@ static int try_to_freeze_tasks(bool user_only) if (p == current || !freeze_task(p)) continue; - /* - * Now that we've done set_freeze_flag, don't - * perturb a task in TASK_STOPPED or TASK_TRACED. - * It is "frozen enough". If the task does wake - * up, it will immediately call try_to_freeze. - * - * Because freeze_task() goes through p's scheduler lock, it's - * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING - * transition can't race with task state testing here. - */ - if (!task_is_stopped_or_traced(p) && - !freezer_should_skip(p)) + if (!freezer_should_skip(p)) todo++; } while_each_thread(g, p); read_unlock(&tasklist_lock); @@ -79,7 +68,7 @@ static int try_to_freeze_tasks(bool user_only) /* * We need to retry, but first give the freezing tasks some - * time to enter the regrigerator. + * time to enter the refrigerator. */ msleep(10); } diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 6a031e684026..9322ff7eaad6 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -139,6 +139,7 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c) default: /* runtime check for not using enum */ BUG(); + return PM_QOS_DEFAULT_VALUE; } } @@ -212,6 +213,69 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, } /** + * pm_qos_flags_remove_req - Remove device PM QoS flags request. + * @pqf: Device PM QoS flags set to remove the request from. + * @req: Request to remove from the set. + */ +static void pm_qos_flags_remove_req(struct pm_qos_flags *pqf, + struct pm_qos_flags_request *req) +{ + s32 val = 0; + + list_del(&req->node); + list_for_each_entry(req, &pqf->list, node) + val |= req->flags; + + pqf->effective_flags = val; +} + +/** + * pm_qos_update_flags - Update a set of PM QoS flags. + * @pqf: Set of flags to update. + * @req: Request to add to the set, to modify, or to remove from the set. + * @action: Action to take on the set. + * @val: Value of the request to add or modify. + * + * Update the given set of PM QoS flags and call notifiers if the aggregate + * value has changed. Returns 1 if the aggregate constraint value has changed, + * 0 otherwise. + */ +bool pm_qos_update_flags(struct pm_qos_flags *pqf, + struct pm_qos_flags_request *req, + enum pm_qos_req_action action, s32 val) +{ + unsigned long irqflags; + s32 prev_value, curr_value; + + spin_lock_irqsave(&pm_qos_lock, irqflags); + + prev_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags; + + switch (action) { + case PM_QOS_REMOVE_REQ: + pm_qos_flags_remove_req(pqf, req); + break; + case PM_QOS_UPDATE_REQ: + pm_qos_flags_remove_req(pqf, req); + case PM_QOS_ADD_REQ: + req->flags = val; + INIT_LIST_HEAD(&req->node); + list_add_tail(&req->node, &pqf->list); + pqf->effective_flags |= val; + break; + default: + /* no action */ + ; + } + + curr_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags; + + spin_unlock_irqrestore(&pm_qos_lock, irqflags); + + return prev_value != curr_value; +} + +/** * pm_qos_request - returns current system wide qos expectation * @pm_qos_class: identification of which qos value is requested * @@ -499,7 +563,7 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, } else { ascii_value[count] = '\0'; } - ret = strict_strtoul(ascii_value, 16, &ulval); + ret = kstrtoul(ascii_value, 16, &ulval); if (ret) { pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret); return -EINVAL; diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 3c9d764eb0d8..7c33ed200410 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -126,7 +126,7 @@ static int swsusp_extents_insert(unsigned long swap_offset) /* Figure out where to put the new node */ while (*new) { - ext = container_of(*new, struct swsusp_extent, node); + ext = rb_entry(*new, struct swsusp_extent, node); parent = *new; if (swap_offset < ext->start) { /* Try to merge */ diff --git a/kernel/printk.c b/kernel/printk.c index 66a2ea37b576..19c0d7bcf24a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -87,6 +87,12 @@ static DEFINE_SEMAPHORE(console_sem); struct console *console_drivers; EXPORT_SYMBOL_GPL(console_drivers); +#ifdef CONFIG_LOCKDEP +static struct lockdep_map console_lock_dep_map = { + .name = "console_lock" +}; +#endif + /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's @@ -741,6 +747,21 @@ void __init setup_log_buf(int early) free, (free * 100) / __LOG_BUF_LEN); } +static bool __read_mostly ignore_loglevel; + +static int __init ignore_loglevel_setup(char *str) +{ + ignore_loglevel = 1; + printk(KERN_INFO "debug: ignoring loglevel setting.\n"); + + return 0; +} + +early_param("ignore_loglevel", ignore_loglevel_setup); +module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" + "print all kernel messages to the console."); + #ifdef CONFIG_BOOT_PRINTK_DELAY static int boot_delay; /* msecs delay after each printk during bootup */ @@ -764,13 +785,15 @@ static int __init boot_delay_setup(char *str) } __setup("boot_delay=", boot_delay_setup); -static void boot_delay_msec(void) +static void boot_delay_msec(int level) { unsigned long long k; unsigned long timeout; - if (boot_delay == 0 || system_state != SYSTEM_BOOTING) + if ((boot_delay == 0 || system_state != SYSTEM_BOOTING) + || (level >= console_loglevel && !ignore_loglevel)) { return; + } k = (unsigned long long)loops_per_msec * boot_delay; @@ -789,7 +812,7 @@ static void boot_delay_msec(void) } } #else -static inline void boot_delay_msec(void) +static inline void boot_delay_msec(int level) { } #endif @@ -1232,21 +1255,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) return do_syslog(type, buf, len, SYSLOG_FROM_CALL); } -static bool __read_mostly ignore_loglevel; - -static int __init ignore_loglevel_setup(char *str) -{ - ignore_loglevel = 1; - printk(KERN_INFO "debug: ignoring loglevel setting.\n"); - - return 0; -} - -early_param("ignore_loglevel", ignore_loglevel_setup); -module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" - "print all kernel messages to the console."); - /* * Call the console drivers, asking them to write out * log_buf[start] to log_buf[end - 1]. @@ -1492,7 +1500,7 @@ asmlinkage int vprintk_emit(int facility, int level, int this_cpu; int printed_len = 0; - boot_delay_msec(); + boot_delay_msec(level); printk_delay(); /* This stops the holder of console_sem just where we want him */ @@ -1890,7 +1898,6 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self, switch (action) { case CPU_ONLINE: case CPU_DEAD: - case CPU_DYING: case CPU_DOWN_FAILED: case CPU_UP_CANCELED: console_lock(); @@ -1909,12 +1916,14 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self, */ void console_lock(void) { - BUG_ON(in_interrupt()); + might_sleep(); + down(&console_sem); if (console_suspended) return; console_locked = 1; console_may_schedule = 1; + mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); } EXPORT_SYMBOL(console_lock); @@ -1936,6 +1945,7 @@ int console_trylock(void) } console_locked = 1; console_may_schedule = 0; + mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_); return 1; } EXPORT_SYMBOL(console_trylock); @@ -2096,6 +2106,7 @@ skip: local_irq_restore(flags); } console_locked = 0; + mutex_release(&console_lock_dep_map, 1, _RET_IP_); /* Release the exclusive_console once it is used */ if (unlikely(exclusive_console)) diff --git a/kernel/profile.c b/kernel/profile.c index 76b8e77773ee..1f391819c42f 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -8,9 +8,10 @@ * Scheduler profiling support, Arjan van de Ven and Ingo Molnar, * Red Hat, July 2004 * Consolidation of architecture support code for profiling, - * William Irwin, Oracle, July 2004 + * Nadia Yvette Chambers, Oracle, July 2004 * Amortized hit count accounting via per-cpu open-addressed hashtables - * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 + * to resolve timer interrupt livelocks, Nadia Yvette Chambers, + * Oracle, 2004 */ #include <linux/export.h> @@ -256,7 +257,7 @@ EXPORT_SYMBOL_GPL(unregister_timer_hook); * pagetable hash functions, but uses a full hashtable full of finite * collision chains, not just pairs of them. * - * -- wli + * -- nyc */ static void __profile_flip_buffers(void *unused) { diff --git a/kernel/ptrace.c b/kernel/ptrace.c index a232bb59d93f..1599157336a6 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -180,7 +180,8 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) return has_ns_capability(current, ns, CAP_SYS_PTRACE); } -int __ptrace_may_access(struct task_struct *task, unsigned int mode) +/* Returns 0 on success, -errno on denial. */ +static int __ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; @@ -214,8 +215,12 @@ ok: smp_rmb(); if (task->mm) dumpable = get_dumpable(task->mm); - if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode)) + rcu_read_lock(); + if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { + rcu_read_unlock(); return -EPERM; + } + rcu_read_unlock(); return security_ptrace_access_check(task, mode); } @@ -279,8 +284,10 @@ static int ptrace_attach(struct task_struct *task, long request, if (seize) flags |= PT_SEIZED; - if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) + rcu_read_lock(); + if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE)) flags |= PT_PTRACE_CAP; + rcu_read_unlock(); task->ptrace = flags; __ptrace_link(task, current); @@ -456,6 +463,9 @@ void exit_ptrace(struct task_struct *tracer) return; list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { + if (unlikely(p->ptrace & PT_EXITKILL)) + send_sig_info(SIGKILL, SEND_SIG_FORCED, p); + if (__ptrace_detach(tracer, p)) list_add(&p->ptrace_entry, &ptrace_dead); } diff --git a/kernel/rcu.h b/kernel/rcu.h index 8ba99cdc6515..20dfba576c2b 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h @@ -109,4 +109,6 @@ static inline bool __rcu_reclaim(char *rn, struct rcu_head *head) } } +extern int rcu_expedited; + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 4e6a61b15e86..a2cf76177b44 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -45,12 +45,16 @@ #include <linux/mutex.h> #include <linux/export.h> #include <linux/hardirq.h> +#include <linux/delay.h> +#include <linux/module.h> #define CREATE_TRACE_POINTS #include <trace/events/rcu.h> #include "rcu.h" +module_param(rcu_expedited, int, 0); + #ifdef CONFIG_PREEMPT_RCU /* @@ -81,6 +85,9 @@ void __rcu_read_unlock(void) } else { barrier(); /* critical section before exit code. */ t->rcu_read_lock_nesting = INT_MIN; +#ifdef CONFIG_PROVE_RCU_DELAY + udelay(10); /* Make preemption more probable. */ +#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ barrier(); /* assign before ->rcu_read_unlock_special load */ if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) rcu_read_unlock_special(t); diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 547b1fe5b052..e7dce58f9c2a 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -56,25 +56,28 @@ static void __call_rcu(struct rcu_head *head, static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ -static void rcu_idle_enter_common(long long oldval) +static void rcu_idle_enter_common(long long newval) { - if (rcu_dynticks_nesting) { + if (newval) { RCU_TRACE(trace_rcu_dyntick("--=", - oldval, rcu_dynticks_nesting)); + rcu_dynticks_nesting, newval)); + rcu_dynticks_nesting = newval; return; } - RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting)); + RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting, newval)); if (!is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", - oldval, rcu_dynticks_nesting)); + rcu_dynticks_nesting, newval)); ftrace_dump(DUMP_ALL); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ } rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ + barrier(); + rcu_dynticks_nesting = newval; } /* @@ -84,17 +87,16 @@ static void rcu_idle_enter_common(long long oldval) void rcu_idle_enter(void) { unsigned long flags; - long long oldval; + long long newval; local_irq_save(flags); - oldval = rcu_dynticks_nesting; WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) - rcu_dynticks_nesting = 0; + newval = 0; else - rcu_dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; - rcu_idle_enter_common(oldval); + newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE; + rcu_idle_enter_common(newval); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -105,15 +107,15 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter); void rcu_irq_exit(void) { unsigned long flags; - long long oldval; + long long newval; local_irq_save(flags); - oldval = rcu_dynticks_nesting; - rcu_dynticks_nesting--; - WARN_ON_ONCE(rcu_dynticks_nesting < 0); - rcu_idle_enter_common(oldval); + newval = rcu_dynticks_nesting - 1; + WARN_ON_ONCE(newval < 0); + rcu_idle_enter_common(newval); local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(rcu_irq_exit); /* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */ static void rcu_idle_exit_common(long long oldval) @@ -171,6 +173,7 @@ void rcu_irq_enter(void) rcu_idle_exit_common(oldval); local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(rcu_irq_enter); #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -192,7 +195,7 @@ EXPORT_SYMBOL(rcu_is_cpu_idle); */ int rcu_is_cpu_rrupt_from_idle(void) { - return rcu_dynticks_nesting <= 0; + return rcu_dynticks_nesting <= 1; } /* diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 918fd1e8509c..f85016a2309b 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -278,7 +278,7 @@ static int rcu_boost(void) rcu_preempt_ctrlblk.exp_tasks == NULL) return 0; /* Nothing to boost. */ - raw_local_irq_save(flags); + local_irq_save(flags); /* * Recheck with irqs disabled: all tasks in need of boosting @@ -287,7 +287,7 @@ static int rcu_boost(void) */ if (rcu_preempt_ctrlblk.boost_tasks == NULL && rcu_preempt_ctrlblk.exp_tasks == NULL) { - raw_local_irq_restore(flags); + local_irq_restore(flags); return 0; } @@ -317,7 +317,7 @@ static int rcu_boost(void) t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&mtx, t); t->rcu_boost_mutex = &mtx; - raw_local_irq_restore(flags); + local_irq_restore(flags); rt_mutex_lock(&mtx); rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ @@ -706,7 +706,10 @@ void synchronize_rcu(void) return; /* Once we get past the fastpath checks, same code as rcu_barrier(). */ - rcu_barrier(); + if (rcu_expedited) + synchronize_rcu_expedited(); + else + rcu_barrier(); } EXPORT_SYMBOL_GPL(synchronize_rcu); @@ -991,9 +994,9 @@ static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) { unsigned long flags; - raw_local_irq_save(flags); + local_irq_save(flags); rcp->qlen -= n; - raw_local_irq_restore(flags); + local_irq_restore(flags); } /* diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 25b15033c61f..31dea01c85fd 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -53,10 +53,11 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@fre static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ static int nfakewriters = 4; /* # fake writer threads */ -static int stat_interval; /* Interval between stats, in seconds. */ - /* Defaults to "only at end of test". */ +static int stat_interval = 60; /* Interval between stats, in seconds. */ + /* Zero means "only at end of test". */ static bool verbose; /* Print more debug info. */ -static bool test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ +static bool test_no_idle_hz = true; + /* Test RCU support for tickless idle CPUs. */ static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ static int stutter = 5; /* Start/stop testing interval (in sec) */ static int irqreader = 1; /* RCU readers from irq (timers). */ @@ -119,11 +120,11 @@ MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); #define TORTURE_FLAG "-torture:" #define PRINTK_STRING(s) \ - do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) + do { pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) #define VERBOSE_PRINTK_STRING(s) \ - do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) + do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) #define VERBOSE_PRINTK_ERRSTRING(s) \ - do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) + do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) static char printk_buf[4096]; @@ -176,8 +177,14 @@ static long n_rcu_torture_boosts; static long n_rcu_torture_timers; static long n_offline_attempts; static long n_offline_successes; +static unsigned long sum_offline; +static int min_offline = -1; +static int max_offline; static long n_online_attempts; static long n_online_successes; +static unsigned long sum_online; +static int min_online = -1; +static int max_online; static long n_barrier_attempts; static long n_barrier_successes; static struct list_head rcu_torture_removed; @@ -235,7 +242,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1, if (fullstop == FULLSTOP_DONTSTOP) fullstop = FULLSTOP_SHUTDOWN; else - printk(KERN_WARNING /* but going down anyway, so... */ + pr_warn(/* but going down anyway, so... */ "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); mutex_unlock(&fullstop_mutex); return NOTIFY_DONE; @@ -248,7 +255,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1, static void rcutorture_shutdown_absorb(char *title) { if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { - printk(KERN_NOTICE + pr_notice( "rcutorture thread %s parking due to system shutdown\n", title); schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); @@ -332,7 +339,6 @@ rcu_stutter_wait(char *title) struct rcu_torture_ops { void (*init)(void); - void (*cleanup)(void); int (*readlock)(void); void (*read_delay)(struct rcu_random_state *rrsp); void (*readunlock)(int idx); @@ -424,7 +430,6 @@ static void rcu_torture_deferred_free(struct rcu_torture *p) static struct rcu_torture_ops rcu_ops = { .init = NULL, - .cleanup = NULL, .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, @@ -468,7 +473,6 @@ static void rcu_sync_torture_init(void) static struct rcu_torture_ops rcu_sync_ops = { .init = rcu_sync_torture_init, - .cleanup = NULL, .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, @@ -486,7 +490,6 @@ static struct rcu_torture_ops rcu_sync_ops = { static struct rcu_torture_ops rcu_expedited_ops = { .init = rcu_sync_torture_init, - .cleanup = NULL, .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_torture_read_unlock, @@ -529,7 +532,6 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p) static struct rcu_torture_ops rcu_bh_ops = { .init = NULL, - .cleanup = NULL, .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, @@ -546,7 +548,6 @@ static struct rcu_torture_ops rcu_bh_ops = { static struct rcu_torture_ops rcu_bh_sync_ops = { .init = rcu_sync_torture_init, - .cleanup = NULL, .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, @@ -563,7 +564,6 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { static struct rcu_torture_ops rcu_bh_expedited_ops = { .init = rcu_sync_torture_init, - .cleanup = NULL, .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, @@ -582,19 +582,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = { * Definitions for srcu torture testing. */ -static struct srcu_struct srcu_ctl; - -static void srcu_torture_init(void) -{ - init_srcu_struct(&srcu_ctl); - rcu_sync_torture_init(); -} - -static void srcu_torture_cleanup(void) -{ - synchronize_srcu(&srcu_ctl); - cleanup_srcu_struct(&srcu_ctl); -} +DEFINE_STATIC_SRCU(srcu_ctl); static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) { @@ -665,8 +653,7 @@ static int srcu_torture_stats(char *page) } static struct rcu_torture_ops srcu_ops = { - .init = srcu_torture_init, - .cleanup = srcu_torture_cleanup, + .init = rcu_sync_torture_init, .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, @@ -680,8 +667,7 @@ static struct rcu_torture_ops srcu_ops = { }; static struct rcu_torture_ops srcu_sync_ops = { - .init = srcu_torture_init, - .cleanup = srcu_torture_cleanup, + .init = rcu_sync_torture_init, .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, @@ -705,8 +691,7 @@ static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl) } static struct rcu_torture_ops srcu_raw_ops = { - .init = srcu_torture_init, - .cleanup = srcu_torture_cleanup, + .init = rcu_sync_torture_init, .readlock = srcu_torture_read_lock_raw, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock_raw, @@ -720,8 +705,7 @@ static struct rcu_torture_ops srcu_raw_ops = { }; static struct rcu_torture_ops srcu_raw_sync_ops = { - .init = srcu_torture_init, - .cleanup = srcu_torture_cleanup, + .init = rcu_sync_torture_init, .readlock = srcu_torture_read_lock_raw, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock_raw, @@ -740,8 +724,7 @@ static void srcu_torture_synchronize_expedited(void) } static struct rcu_torture_ops srcu_expedited_ops = { - .init = srcu_torture_init, - .cleanup = srcu_torture_cleanup, + .init = rcu_sync_torture_init, .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, @@ -776,7 +759,6 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p) static struct rcu_torture_ops sched_ops = { .init = rcu_sync_torture_init, - .cleanup = NULL, .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, @@ -792,7 +774,6 @@ static struct rcu_torture_ops sched_ops = { static struct rcu_torture_ops sched_sync_ops = { .init = rcu_sync_torture_init, - .cleanup = NULL, .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, @@ -807,7 +788,6 @@ static struct rcu_torture_ops sched_sync_ops = { static struct rcu_torture_ops sched_expedited_ops = { .init = rcu_sync_torture_init, - .cleanup = NULL, .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, @@ -1214,11 +1194,13 @@ rcu_torture_printk(char *page) n_rcu_torture_boost_failure, n_rcu_torture_boosts, n_rcu_torture_timers); - cnt += sprintf(&page[cnt], "onoff: %ld/%ld:%ld/%ld ", - n_online_successes, - n_online_attempts, - n_offline_successes, - n_offline_attempts); + cnt += sprintf(&page[cnt], + "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", + n_online_successes, n_online_attempts, + n_offline_successes, n_offline_attempts, + min_online, max_online, + min_offline, max_offline, + sum_online, sum_offline, HZ); cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld", n_barrier_successes, n_barrier_attempts, @@ -1267,7 +1249,7 @@ rcu_torture_stats_print(void) int cnt; cnt = rcu_torture_printk(printk_buf); - printk(KERN_ALERT "%s", printk_buf); + pr_alert("%s", printk_buf); } /* @@ -1380,20 +1362,24 @@ rcu_torture_stutter(void *arg) static inline void rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) { - printk(KERN_ALERT "%s" TORTURE_FLAG - "--- %s: nreaders=%d nfakewriters=%d " - "stat_interval=%d verbose=%d test_no_idle_hz=%d " - "shuffle_interval=%d stutter=%d irqreader=%d " - "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " - "test_boost=%d/%d test_boost_interval=%d " - "test_boost_duration=%d shutdown_secs=%d " - "onoff_interval=%d onoff_holdoff=%d\n", - torture_type, tag, nrealreaders, nfakewriters, - stat_interval, verbose, test_no_idle_hz, shuffle_interval, - stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, - test_boost, cur_ops->can_boost, - test_boost_interval, test_boost_duration, shutdown_secs, - onoff_interval, onoff_holdoff); + pr_alert("%s" TORTURE_FLAG + "--- %s: nreaders=%d nfakewriters=%d " + "stat_interval=%d verbose=%d test_no_idle_hz=%d " + "shuffle_interval=%d stutter=%d irqreader=%d " + "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " + "test_boost=%d/%d test_boost_interval=%d " + "test_boost_duration=%d shutdown_secs=%d " + "stall_cpu=%d stall_cpu_holdoff=%d " + "n_barrier_cbs=%d " + "onoff_interval=%d onoff_holdoff=%d\n", + torture_type, tag, nrealreaders, nfakewriters, + stat_interval, verbose, test_no_idle_hz, shuffle_interval, + stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, + test_boost, cur_ops->can_boost, + test_boost_interval, test_boost_duration, shutdown_secs, + stall_cpu, stall_cpu_holdoff, + n_barrier_cbs, + onoff_interval, onoff_holdoff); } static struct notifier_block rcutorture_shutdown_nb = { @@ -1460,9 +1446,9 @@ rcu_torture_shutdown(void *arg) !kthread_should_stop()) { delta = shutdown_time - jiffies_snap; if (verbose) - printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_shutdown task: %lu jiffies remaining\n", - torture_type, delta); + pr_alert("%s" TORTURE_FLAG + "rcu_torture_shutdown task: %lu jiffies remaining\n", + torture_type, delta); schedule_timeout_interruptible(delta); jiffies_snap = ACCESS_ONCE(jiffies); } @@ -1490,8 +1476,11 @@ static int __cpuinit rcu_torture_onoff(void *arg) { int cpu; + unsigned long delta; int maxcpu = -1; DEFINE_RCU_RANDOM(rand); + int ret; + unsigned long starttime; VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); for_each_online_cpu(cpu) @@ -1506,29 +1495,57 @@ rcu_torture_onoff(void *arg) cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { if (verbose) - printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: offlining %d\n", - torture_type, cpu); + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: offlining %d\n", + torture_type, cpu); + starttime = jiffies; n_offline_attempts++; - if (cpu_down(cpu) == 0) { + ret = cpu_down(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: offline %d failed: errno %d\n", + torture_type, cpu, ret); + } else { if (verbose) - printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: offlined %d\n", - torture_type, cpu); + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: offlined %d\n", + torture_type, cpu); n_offline_successes++; + delta = jiffies - starttime; + sum_offline += delta; + if (min_offline < 0) { + min_offline = delta; + max_offline = delta; + } + if (min_offline > delta) + min_offline = delta; + if (max_offline < delta) + max_offline = delta; } } else if (cpu_is_hotpluggable(cpu)) { if (verbose) - printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: onlining %d\n", - torture_type, cpu); + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: onlining %d\n", + torture_type, cpu); + starttime = jiffies; n_online_attempts++; if (cpu_up(cpu) == 0) { if (verbose) - printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: onlined %d\n", - torture_type, cpu); + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: onlined %d\n", + torture_type, cpu); n_online_successes++; + delta = jiffies - starttime; + sum_online += delta; + if (min_online < 0) { + min_online = delta; + max_online = delta; + } + if (min_online > delta) + min_online = delta; + if (max_online < delta) + max_online = delta; } } schedule_timeout_interruptible(onoff_interval * HZ); @@ -1593,14 +1610,14 @@ static int __cpuinit rcu_torture_stall(void *args) if (!kthread_should_stop()) { stop_at = get_seconds() + stall_cpu; /* RCU CPU stall is expected behavior in following code. */ - printk(KERN_ALERT "rcu_torture_stall start.\n"); + pr_alert("rcu_torture_stall start.\n"); rcu_read_lock(); preempt_disable(); while (ULONG_CMP_LT(get_seconds(), stop_at)) continue; /* Induce RCU CPU stall warning. */ preempt_enable(); rcu_read_unlock(); - printk(KERN_ALERT "rcu_torture_stall end.\n"); + pr_alert("rcu_torture_stall end.\n"); } rcutorture_shutdown_absorb("rcu_torture_stall"); while (!kthread_should_stop()) @@ -1716,12 +1733,12 @@ static int rcu_torture_barrier_init(void) if (n_barrier_cbs == 0) return 0; if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { - printk(KERN_ALERT "%s" TORTURE_FLAG - " Call or barrier ops missing for %s,\n", - torture_type, cur_ops->name); - printk(KERN_ALERT "%s" TORTURE_FLAG - " RCU barrier testing omitted from run.\n", - torture_type); + pr_alert("%s" TORTURE_FLAG + " Call or barrier ops missing for %s,\n", + torture_type, cur_ops->name); + pr_alert("%s" TORTURE_FLAG + " RCU barrier testing omitted from run.\n", + torture_type); return 0; } atomic_set(&barrier_cbs_count, 0); @@ -1814,7 +1831,7 @@ rcu_torture_cleanup(void) mutex_lock(&fullstop_mutex); rcutorture_record_test_transition(); if (fullstop == FULLSTOP_SHUTDOWN) { - printk(KERN_WARNING /* but going down anyway, so... */ + pr_warn(/* but going down anyway, so... */ "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); mutex_unlock(&fullstop_mutex); schedule_timeout_uninterruptible(10); @@ -1903,8 +1920,6 @@ rcu_torture_cleanup(void) rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ - if (cur_ops->cleanup) - cur_ops->cleanup(); if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); else if (n_online_successes != n_online_attempts || @@ -1938,17 +1953,17 @@ rcu_torture_init(void) break; } if (i == ARRAY_SIZE(torture_ops)) { - printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n", - torture_type); - printk(KERN_ALERT "rcu-torture types:"); + pr_alert("rcu-torture: invalid torture type: \"%s\"\n", + torture_type); + pr_alert("rcu-torture types:"); for (i = 0; i < ARRAY_SIZE(torture_ops); i++) - printk(KERN_ALERT " %s", torture_ops[i]->name); - printk(KERN_ALERT "\n"); + pr_alert(" %s", torture_ops[i]->name); + pr_alert("\n"); mutex_unlock(&fullstop_mutex); return -EINVAL; } if (cur_ops->fqs == NULL && fqs_duration != 0) { - printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); + pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); fqs_duration = 0; } if (cur_ops->init) @@ -1996,14 +2011,15 @@ rcu_torture_init(void) /* Start up the kthreads. */ VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); - writer_task = kthread_run(rcu_torture_writer, NULL, - "rcu_torture_writer"); + writer_task = kthread_create(rcu_torture_writer, NULL, + "rcu_torture_writer"); if (IS_ERR(writer_task)) { firsterr = PTR_ERR(writer_task); VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); writer_task = NULL; goto unwind; } + wake_up_process(writer_task); fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), GFP_KERNEL); if (fakewriter_tasks == NULL) { @@ -2118,14 +2134,15 @@ rcu_torture_init(void) } if (shutdown_secs > 0) { shutdown_time = jiffies + shutdown_secs * HZ; - shutdown_task = kthread_run(rcu_torture_shutdown, NULL, - "rcu_torture_shutdown"); + shutdown_task = kthread_create(rcu_torture_shutdown, NULL, + "rcu_torture_shutdown"); if (IS_ERR(shutdown_task)) { firsterr = PTR_ERR(shutdown_task); VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); shutdown_task = NULL; goto unwind; } + wake_up_process(shutdown_task); } i = rcu_torture_onoff_init(); if (i != 0) { diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f280e542e3e9..e441b77b614e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -52,6 +52,7 @@ #include <linux/prefetch.h> #include <linux/delay.h> #include <linux/stop_machine.h> +#include <linux/random.h> #include "rcutree.h" #include <trace/events/rcu.h> @@ -61,18 +62,19 @@ /* Data structures. */ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; +static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; #define RCU_STATE_INITIALIZER(sname, cr) { \ .level = { &sname##_state.node[0] }, \ .call = cr, \ .fqs_state = RCU_GP_IDLE, \ - .gpnum = -300, \ - .completed = -300, \ - .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \ + .gpnum = 0UL - 300UL, \ + .completed = 0UL - 300UL, \ + .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ .orphan_nxttail = &sname##_state.orphan_nxtlist, \ .orphan_donetail = &sname##_state.orphan_donelist, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ - .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \ + .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ .name = #sname, \ } @@ -88,7 +90,7 @@ LIST_HEAD(rcu_struct_flavors); /* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; -module_param(rcu_fanout_leaf, int, 0); +module_param(rcu_fanout_leaf, int, 0444); int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ NUM_RCU_LVL_0, @@ -133,13 +135,12 @@ static int rcu_scheduler_fully_active __read_mostly; */ static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu); DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); DEFINE_PER_CPU(char, rcu_cpu_has_work); #endif /* #ifdef CONFIG_RCU_BOOST */ -static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); @@ -175,8 +176,6 @@ void rcu_sched_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); - rdp->passed_quiesce_gpnum = rdp->gpnum; - barrier(); if (rdp->passed_quiesce == 0) trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); rdp->passed_quiesce = 1; @@ -186,8 +185,6 @@ void rcu_bh_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - rdp->passed_quiesce_gpnum = rdp->gpnum; - barrier(); if (rdp->passed_quiesce == 0) trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); rdp->passed_quiesce = 1; @@ -212,13 +209,13 @@ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks = ATOMIC_INIT(1), }; -static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ -static int qhimark = 10000; /* If this many pending, ignore blimit. */ -static int qlowmark = 100; /* Once only this many pending, use blimit. */ +static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ +static long qhimark = 10000; /* If this many pending, ignore blimit. */ +static long qlowmark = 100; /* Once only this many pending, use blimit. */ -module_param(blimit, int, 0); -module_param(qhimark, int, 0); -module_param(qlowmark, int, 0); +module_param(blimit, long, 0444); +module_param(qhimark, long, 0444); +module_param(qlowmark, long, 0444); int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; @@ -226,7 +223,14 @@ int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_suppress, int, 0644); module_param(rcu_cpu_stall_timeout, int, 0644); -static void force_quiescent_state(struct rcu_state *rsp, int relaxed); +static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; +static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; + +module_param(jiffies_till_first_fqs, ulong, 0644); +module_param(jiffies_till_next_fqs, ulong, 0644); + +static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); +static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(int cpu); /* @@ -252,7 +256,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); */ void rcu_bh_force_quiescent_state(void) { - force_quiescent_state(&rcu_bh_state, 0); + force_quiescent_state(&rcu_bh_state); } EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); @@ -286,7 +290,7 @@ EXPORT_SYMBOL_GPL(rcutorture_record_progress); */ void rcu_sched_force_quiescent_state(void) { - force_quiescent_state(&rcu_sched_state, 0); + force_quiescent_state(&rcu_sched_state); } EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); @@ -296,7 +300,8 @@ EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); static int cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) { - return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]; + return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] && + rdp->nxttail[RCU_DONE_TAIL] != NULL; } /* @@ -305,7 +310,12 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) static int cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { - return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp); + struct rcu_head **ntp; + + ntp = rdp->nxttail[RCU_DONE_TAIL + + (ACCESS_ONCE(rsp->completed) != rdp->completed)]; + return rdp->nxttail[RCU_DONE_TAIL] && ntp && *ntp && + !rcu_gp_in_progress(rsp); } /* @@ -317,45 +327,17 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) } /* - * If the specified CPU is offline, tell the caller that it is in - * a quiescent state. Otherwise, whack it with a reschedule IPI. - * Grace periods can end up waiting on an offline CPU when that - * CPU is in the process of coming online -- it will be added to the - * rcu_node bitmasks before it actually makes it online. The same thing - * can happen while a CPU is in the process of coming online. Because this - * race is quite rare, we check for it after detecting that the grace - * period has been delayed rather than checking each and every CPU - * each and every time we start a new grace period. - */ -static int rcu_implicit_offline_qs(struct rcu_data *rdp) -{ - /* - * If the CPU is offline for more than a jiffy, it is in a quiescent - * state. We can trust its state not to change because interrupts - * are disabled. The reason for the jiffy's worth of slack is to - * handle CPUs initializing on the way up and finding their way - * to the idle loop on the way down. - */ - if (cpu_is_offline(rdp->cpu) && - ULONG_CMP_LT(rdp->rsp->gp_start + 2, jiffies)) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); - rdp->offline_fqs++; - return 1; - } - return 0; -} - -/* - * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle + * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state * * If the new value of the ->dynticks_nesting counter now is zero, * we really have entered idle, and must do the appropriate accounting. * The caller must have disabled interrupts. */ -static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) +static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, + bool user) { trace_rcu_dyntick("Start", oldval, 0); - if (!is_idle_task(current)) { + if (!user && !is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); @@ -372,7 +354,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); /* - * The idle task is not permitted to enter the idle loop while + * It is illegal to enter an extended quiescent state while * in an RCU read-side critical section. */ rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), @@ -383,6 +365,25 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) "Illegal idle entry in RCU-sched read-side critical section."); } +/* + * Enter an RCU extended quiescent state, which can be either the + * idle loop or adaptive-tickless usermode execution. + */ +static void rcu_eqs_enter(bool user) +{ + long long oldval; + struct rcu_dynticks *rdtp; + + rdtp = &__get_cpu_var(rcu_dynticks); + oldval = rdtp->dynticks_nesting; + WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); + if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) + rdtp->dynticks_nesting = 0; + else + rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; + rcu_eqs_enter_common(rdtp, oldval, user); +} + /** * rcu_idle_enter - inform RCU that current CPU is entering idle * @@ -398,21 +399,48 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) void rcu_idle_enter(void) { unsigned long flags; - long long oldval; + + local_irq_save(flags); + rcu_eqs_enter(false); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(rcu_idle_enter); + +#ifdef CONFIG_RCU_USER_QS +/** + * rcu_user_enter - inform RCU that we are resuming userspace. + * + * Enter RCU idle mode right before resuming userspace. No use of RCU + * is permitted between this call and rcu_user_exit(). This way the + * CPU doesn't need to maintain the tick for RCU maintenance purposes + * when the CPU runs in userspace. + */ +void rcu_user_enter(void) +{ + rcu_eqs_enter(1); +} + +/** + * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace + * after the current irq returns. + * + * This is similar to rcu_user_enter() but in the context of a non-nesting + * irq. After this call, RCU enters into idle mode when the interrupt + * returns. + */ +void rcu_user_enter_after_irq(void) +{ + unsigned long flags; struct rcu_dynticks *rdtp; local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); - oldval = rdtp->dynticks_nesting; - WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); - if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) - rdtp->dynticks_nesting = 0; - else - rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; - rcu_idle_enter_common(rdtp, oldval); + /* Ensure this irq is interrupting a non-idle RCU state. */ + WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK)); + rdtp->dynticks_nesting = 1; local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(rcu_idle_enter); +#endif /* CONFIG_RCU_USER_QS */ /** * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle @@ -444,18 +472,19 @@ void rcu_irq_exit(void) if (rdtp->dynticks_nesting) trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); else - rcu_idle_enter_common(rdtp, oldval); + rcu_eqs_enter_common(rdtp, oldval, true); local_irq_restore(flags); } /* - * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle + * rcu_eqs_exit_common - current CPU moving away from extended quiescent state * * If the new value of the ->dynticks_nesting counter was previously zero, * we really have exited idle, and must do the appropriate accounting. * The caller must have disabled interrupts. */ -static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) +static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, + int user) { smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ atomic_inc(&rdtp->dynticks); @@ -464,7 +493,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); rcu_cleanup_after_idle(smp_processor_id()); trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); - if (!is_idle_task(current)) { + if (!user && !is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); trace_rcu_dyntick("Error on exit: not idle task", @@ -476,6 +505,25 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) } } +/* + * Exit an RCU extended quiescent state, which can be either the + * idle loop or adaptive-tickless usermode execution. + */ +static void rcu_eqs_exit(bool user) +{ + struct rcu_dynticks *rdtp; + long long oldval; + + rdtp = &__get_cpu_var(rcu_dynticks); + oldval = rdtp->dynticks_nesting; + WARN_ON_ONCE(oldval < 0); + if (oldval & DYNTICK_TASK_NEST_MASK) + rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; + else + rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; + rcu_eqs_exit_common(rdtp, oldval, user); +} + /** * rcu_idle_exit - inform RCU that current CPU is leaving idle * @@ -490,21 +538,47 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) void rcu_idle_exit(void) { unsigned long flags; + + local_irq_save(flags); + rcu_eqs_exit(false); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(rcu_idle_exit); + +#ifdef CONFIG_RCU_USER_QS +/** + * rcu_user_exit - inform RCU that we are exiting userspace. + * + * Exit RCU idle mode while entering the kernel because it can + * run a RCU read side critical section anytime. + */ +void rcu_user_exit(void) +{ + rcu_eqs_exit(1); +} + +/** + * rcu_user_exit_after_irq - inform RCU that we won't resume to userspace + * idle mode after the current non-nesting irq returns. + * + * This is similar to rcu_user_exit() but in the context of an irq. + * This is called when the irq has interrupted a userspace RCU idle mode + * context. When the current non-nesting interrupt returns after this call, + * the CPU won't restore the RCU idle mode. + */ +void rcu_user_exit_after_irq(void) +{ + unsigned long flags; struct rcu_dynticks *rdtp; - long long oldval; local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); - oldval = rdtp->dynticks_nesting; - WARN_ON_ONCE(oldval < 0); - if (oldval & DYNTICK_TASK_NEST_MASK) - rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; - else - rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - rcu_idle_exit_common(rdtp, oldval); + /* Ensure we are interrupting an RCU idle mode. */ + WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK); + rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE; local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(rcu_idle_exit); +#endif /* CONFIG_RCU_USER_QS */ /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle @@ -539,7 +613,7 @@ void rcu_irq_enter(void) if (oldval) trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); else - rcu_idle_exit_common(rdtp, oldval); + rcu_eqs_exit_common(rdtp, oldval, true); local_irq_restore(flags); } @@ -673,7 +747,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) * Return true if the specified CPU has passed through a quiescent * state by virtue of being in or having passed through an dynticks * idle state since the last call to dyntick_save_progress_counter() - * for this same CPU. + * for this same CPU, or by virtue of having been offline. */ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) { @@ -697,8 +771,26 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return 1; } - /* Go check for the CPU being offline. */ - return rcu_implicit_offline_qs(rdp); + /* + * Check for the CPU being offline, but only if the grace period + * is old enough. We don't need to worry about the CPU changing + * state: If we see it offline even once, it has been through a + * quiescent state. + * + * The reason for insisting that the grace period be at least + * one jiffy old is that CPUs that are not quite online and that + * have just gone offline can still execute RCU read-side critical + * sections. + */ + if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies)) + return 0; /* Grace period is not old enough. */ + barrier(); + if (cpu_is_offline(rdp->cpu)) { + trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); + rdp->offline_fqs++; + return 1; + } + return 0; } static int jiffies_till_stall_check(void) @@ -725,6 +817,29 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); } +/* + * Dump stacks of all tasks running on stalled CPUs. This is a fallback + * for architectures that do not implement trigger_all_cpu_backtrace(). + * The NMI-triggered stack traces are more accurate because they are + * printed by the target CPU. + */ +static void rcu_dump_cpu_stacks(struct rcu_state *rsp) +{ + int cpu; + unsigned long flags; + struct rcu_node *rnp; + + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave(&rnp->lock, flags); + if (rnp->qsmask != 0) { + for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) + if (rnp->qsmask & (1UL << cpu)) + dump_cpu_task(rnp->grplo + cpu); + } + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } +} + static void print_other_cpu_stall(struct rcu_state *rsp) { int cpu; @@ -732,6 +847,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) unsigned long flags; int ndetected = 0; struct rcu_node *rnp = rcu_get_root(rsp); + long totqlen = 0; /* Only let one CPU complain about others per time interval. */ @@ -755,14 +871,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp) rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave(&rnp->lock, flags); ndetected += rcu_print_task_stall(rnp); + if (rnp->qsmask != 0) { + for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) + if (rnp->qsmask & (1UL << cpu)) { + print_cpu_stall_info(rsp, + rnp->grplo + cpu); + ndetected++; + } + } raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (rnp->qsmask == 0) - continue; - for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) - if (rnp->qsmask & (1UL << cpu)) { - print_cpu_stall_info(rsp, rnp->grplo + cpu); - ndetected++; - } } /* @@ -775,24 +892,29 @@ static void print_other_cpu_stall(struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); print_cpu_stall_info_end(); - printk(KERN_CONT "(detected by %d, t=%ld jiffies)\n", - smp_processor_id(), (long)(jiffies - rsp->gp_start)); + for_each_possible_cpu(cpu) + totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; + pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n", + smp_processor_id(), (long)(jiffies - rsp->gp_start), + rsp->gpnum, rsp->completed, totqlen); if (ndetected == 0) printk(KERN_ERR "INFO: Stall ended before state dump start\n"); else if (!trigger_all_cpu_backtrace()) - dump_stack(); + rcu_dump_cpu_stacks(rsp); - /* If so configured, complain about tasks blocking the grace period. */ + /* Complain about tasks blocking the grace period. */ rcu_print_detail_task_stall(rsp); - force_quiescent_state(rsp, 0); /* Kick them all. */ + force_quiescent_state(rsp); /* Kick them all. */ } static void print_cpu_stall(struct rcu_state *rsp) { + int cpu; unsigned long flags; struct rcu_node *rnp = rcu_get_root(rsp); + long totqlen = 0; /* * OK, time to rat on ourselves... @@ -803,7 +925,10 @@ static void print_cpu_stall(struct rcu_state *rsp) print_cpu_stall_info_begin(); print_cpu_stall_info(rsp, smp_processor_id()); print_cpu_stall_info_end(); - printk(KERN_CONT " (t=%lu jiffies)\n", jiffies - rsp->gp_start); + for_each_possible_cpu(cpu) + totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; + pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n", + jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen); if (!trigger_all_cpu_backtrace()) dump_stack(); @@ -827,7 +952,8 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) j = ACCESS_ONCE(jiffies); js = ACCESS_ONCE(rsp->jiffies_stall); rnp = rdp->mynode; - if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { + if (rcu_gp_in_progress(rsp) && + (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(rsp); @@ -889,12 +1015,8 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct */ rdp->gpnum = rnp->gpnum; trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); - if (rnp->qsmask & rdp->grpmask) { - rdp->qs_pending = 1; - rdp->passed_quiesce = 0; - } else { - rdp->qs_pending = 0; - } + rdp->passed_quiesce = 0; + rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); } } @@ -945,6 +1067,7 @@ static void init_callback_list(struct rcu_data *rdp) rdp->nxtlist = NULL; for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; + init_nocb_callback_list(rdp); } /* @@ -974,10 +1097,13 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat * our behalf. Catch up with this state to avoid noting * spurious new grace periods. If another grace period * has started, then rnp->gpnum will have advanced, so - * we will detect this later on. + * we will detect this later on. Of course, any quiescent + * states we found for the old GP are now invalid. */ - if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) + if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) { rdp->gpnum = rdp->completed; + rdp->passed_quiesce = 0; + } /* * If RCU does not need a quiescent state from this CPU, @@ -1021,97 +1147,56 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat /* Prior grace period ended, so advance callbacks for current CPU. */ __rcu_process_gp_end(rsp, rnp, rdp); - /* - * Because this CPU just now started the new grace period, we know - * that all of its callbacks will be covered by this upcoming grace - * period, even the ones that were registered arbitrarily recently. - * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL. - * - * Other CPUs cannot be sure exactly when the grace period started. - * Therefore, their recently registered callbacks must pass through - * an additional RCU_NEXT_READY stage, so that they will be handled - * by the next RCU grace period. - */ - rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - /* Set state so that this CPU will detect the next quiescent state. */ __note_new_gpnum(rsp, rnp, rdp); } /* - * Start a new RCU grace period if warranted, re-initializing the hierarchy - * in preparation for detecting the next grace period. The caller must hold - * the root node's ->lock, which is released before return. Hard irqs must - * be disabled. - * - * Note that it is legal for a dying CPU (which is marked as offline) to - * invoke this function. This can happen when the dying CPU reports its - * quiescent state. + * Initialize a new grace period. */ -static void -rcu_start_gp(struct rcu_state *rsp, unsigned long flags) - __releases(rcu_get_root(rsp)->lock) +static int rcu_gp_init(struct rcu_state *rsp) { - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); + struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); - if (!rcu_scheduler_fully_active || - !cpu_needs_another_gp(rsp, rdp)) { - /* - * Either the scheduler hasn't yet spawned the first - * non-idle task or this CPU does not need another - * grace period. Either way, don't start a new grace - * period. - */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } + raw_spin_lock_irq(&rnp->lock); + rsp->gp_flags = 0; /* Clear all flags: New grace period. */ - if (rsp->fqs_active) { - /* - * This CPU needs a grace period, but force_quiescent_state() - * is running. Tell it to start one on this CPU's behalf. - */ - rsp->fqs_need_gp = 1; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; + if (rcu_gp_in_progress(rsp)) { + /* Grace period already in progress, don't start another. */ + raw_spin_unlock_irq(&rnp->lock); + return 0; } /* Advance to a new grace period and initialize state. */ rsp->gpnum++; trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); - WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT); - rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */ - rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; record_gp_stall_check_time(rsp); - raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ + raw_spin_unlock_irq(&rnp->lock); /* Exclude any concurrent CPU-hotplug operations. */ - raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ + mutex_lock(&rsp->onoff_mutex); /* * Set the quiescent-state-needed bits in all the rcu_node - * structures for all currently online CPUs in breadth-first - * order, starting from the root rcu_node structure. This - * operation relies on the layout of the hierarchy within the - * rsp->node[] array. Note that other CPUs will access only - * the leaves of the hierarchy, which still indicate that no + * structures for all currently online CPUs in breadth-first order, + * starting from the root rcu_node structure, relying on the layout + * of the tree within the rsp->node[] array. Note that other CPUs + * will access only the leaves of the hierarchy, thus seeing that no * grace period is in progress, at least until the corresponding * leaf node has been initialized. In addition, we have excluded * CPU-hotplug operations. * - * Note that the grace period cannot complete until we finish - * the initialization process, as there will be at least one - * qsmask bit set in the root node until that time, namely the - * one corresponding to this CPU, due to the fact that we have - * irqs disabled. + * The grace period cannot complete until the initialization + * process finishes, because this kthread handles both. */ rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock_irq(&rnp->lock); + rdp = this_cpu_ptr(rsp->rda); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; rnp->gpnum = rsp->gpnum; + WARN_ON_ONCE(rnp->completed != rsp->completed); rnp->completed = rsp->completed; if (rnp == rdp->mynode) rcu_start_gp_per_cpu(rsp, rnp, rdp); @@ -1119,37 +1204,54 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) trace_rcu_grace_period_init(rsp->name, rnp->gpnum, rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_irq(&rnp->lock); +#ifdef CONFIG_PROVE_RCU_DELAY + if ((random32() % (rcu_num_nodes * 8)) == 0) + schedule_timeout_uninterruptible(2); +#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ + cond_resched(); } - rnp = rcu_get_root(rsp); - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); + mutex_unlock(&rsp->onoff_mutex); + return 1; } /* - * Report a full set of quiescent states to the specified rcu_state - * data structure. This involves cleaning up after the prior grace - * period and letting rcu_start_gp() start up the next grace period - * if one is needed. Note that the caller must hold rnp->lock, as - * required by rcu_start_gp(), which will release it. + * Do one round of quiescent-state forcing. */ -static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) - __releases(rcu_get_root(rsp)->lock) +int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) { - unsigned long gp_duration; + int fqs_state = fqs_state_in; struct rcu_node *rnp = rcu_get_root(rsp); - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); + rsp->n_force_qs++; + if (fqs_state == RCU_SAVE_DYNTICK) { + /* Collect dyntick-idle snapshots. */ + force_qs_rnp(rsp, dyntick_save_progress_counter); + fqs_state = RCU_FORCE_QS; + } else { + /* Handle dyntick-idle and offline CPUs. */ + force_qs_rnp(rsp, rcu_implicit_dynticks_qs); + } + /* Clear flag to prevent immediate re-entry. */ + if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { + raw_spin_lock_irq(&rnp->lock); + rsp->gp_flags &= ~RCU_GP_FLAG_FQS; + raw_spin_unlock_irq(&rnp->lock); + } + return fqs_state; +} - /* - * Ensure that all grace-period and pre-grace-period activity - * is seen before the assignment to rsp->completed. - */ - smp_mb(); /* See above block comment. */ +/* + * Clean up after the old grace period. + */ +static void rcu_gp_cleanup(struct rcu_state *rsp) +{ + unsigned long gp_duration; + struct rcu_data *rdp; + struct rcu_node *rnp = rcu_get_root(rsp); + + raw_spin_lock_irq(&rnp->lock); gp_duration = jiffies - rsp->gp_start; if (gp_duration > rsp->gp_max) rsp->gp_max = gp_duration; @@ -1161,35 +1263,171 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) * they can do to advance the grace period. It is therefore * safe for us to drop the lock in order to mark the grace * period as completed in all of the rcu_node structures. - * - * But if this CPU needs another grace period, it will take - * care of this while initializing the next grace period. - * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL - * because the callbacks have not yet been advanced: Those - * callbacks are waiting on the grace period that just now - * completed. */ - if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_irq(&rnp->lock); - /* - * Propagate new ->completed value to rcu_node structures - * so that other CPUs don't have to wait until the start - * of the next grace period to process their callbacks. - */ - rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rnp->completed = rsp->gpnum; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - } - rnp = rcu_get_root(rsp); - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + /* + * Propagate new ->completed value to rcu_node structures so + * that other CPUs don't have to wait until the start of the next + * grace period to process their callbacks. This also avoids + * some nasty RCU grace-period initialization races by forcing + * the end of the current grace period to be completely recorded in + * all of the rcu_node structures before the beginning of the next + * grace period is recorded in any of the rcu_node structures. + */ + rcu_for_each_node_breadth_first(rsp, rnp) { + raw_spin_lock_irq(&rnp->lock); + rnp->completed = rsp->gpnum; + raw_spin_unlock_irq(&rnp->lock); + cond_resched(); } + rnp = rcu_get_root(rsp); + raw_spin_lock_irq(&rnp->lock); - rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ + rsp->completed = rsp->gpnum; /* Declare grace period done. */ trace_rcu_grace_period(rsp->name, rsp->completed, "end"); rsp->fqs_state = RCU_GP_IDLE; - rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ + rdp = this_cpu_ptr(rsp->rda); + if (cpu_needs_another_gp(rsp, rdp)) + rsp->gp_flags = 1; + raw_spin_unlock_irq(&rnp->lock); +} + +/* + * Body of kthread that handles grace periods. + */ +static int __noreturn rcu_gp_kthread(void *arg) +{ + int fqs_state; + unsigned long j; + int ret; + struct rcu_state *rsp = arg; + struct rcu_node *rnp = rcu_get_root(rsp); + + for (;;) { + + /* Handle grace-period start. */ + for (;;) { + wait_event_interruptible(rsp->gp_wq, + rsp->gp_flags & + RCU_GP_FLAG_INIT); + if ((rsp->gp_flags & RCU_GP_FLAG_INIT) && + rcu_gp_init(rsp)) + break; + cond_resched(); + flush_signals(current); + } + + /* Handle quiescent-state forcing. */ + fqs_state = RCU_SAVE_DYNTICK; + j = jiffies_till_first_fqs; + if (j > HZ) { + j = HZ; + jiffies_till_first_fqs = HZ; + } + for (;;) { + rsp->jiffies_force_qs = jiffies + j; + ret = wait_event_interruptible_timeout(rsp->gp_wq, + (rsp->gp_flags & RCU_GP_FLAG_FQS) || + (!ACCESS_ONCE(rnp->qsmask) && + !rcu_preempt_blocked_readers_cgp(rnp)), + j); + /* If grace period done, leave loop. */ + if (!ACCESS_ONCE(rnp->qsmask) && + !rcu_preempt_blocked_readers_cgp(rnp)) + break; + /* If time for quiescent-state forcing, do it. */ + if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) { + fqs_state = rcu_gp_fqs(rsp, fqs_state); + cond_resched(); + } else { + /* Deal with stray signal. */ + cond_resched(); + flush_signals(current); + } + j = jiffies_till_next_fqs; + if (j > HZ) { + j = HZ; + jiffies_till_next_fqs = HZ; + } else if (j < 1) { + j = 1; + jiffies_till_next_fqs = 1; + } + } + + /* Handle grace-period end. */ + rcu_gp_cleanup(rsp); + } +} + +/* + * Start a new RCU grace period if warranted, re-initializing the hierarchy + * in preparation for detecting the next grace period. The caller must hold + * the root node's ->lock, which is released before return. Hard irqs must + * be disabled. + * + * Note that it is legal for a dying CPU (which is marked as offline) to + * invoke this function. This can happen when the dying CPU reports its + * quiescent state. + */ +static void +rcu_start_gp(struct rcu_state *rsp, unsigned long flags) + __releases(rcu_get_root(rsp)->lock) +{ + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); + struct rcu_node *rnp = rcu_get_root(rsp); + + if (!rsp->gp_kthread || + !cpu_needs_another_gp(rsp, rdp)) { + /* + * Either we have not yet spawned the grace-period + * task, this CPU does not need another grace period, + * or a grace period is already in progress. + * Either way, don't start a new grace period. + */ + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + + /* + * Because there is no grace period in progress right now, + * any callbacks we have up to this point will be satisfied + * by the next grace period. So promote all callbacks to be + * handled after the end of the next grace period. If the + * CPU is not yet aware of the end of the previous grace period, + * we need to allow for the callback advancement that will + * occur when it does become aware. Deadlock prevents us from + * making it aware at this point: We cannot acquire a leaf + * rcu_node ->lock while holding the root rcu_node ->lock. + */ + rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + if (rdp->completed == rsp->completed) + rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + + rsp->gp_flags = RCU_GP_FLAG_INIT; + raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ + + /* Ensure that CPU is aware of completion of last grace period. */ + rcu_process_gp_end(rsp, rdp); + local_irq_restore(flags); + + /* Wake up rcu_gp_kthread() to start the grace period. */ + wake_up(&rsp->gp_wq); +} + +/* + * Report a full set of quiescent states to the specified rcu_state + * data structure. This involves cleaning up after the prior grace + * period and letting rcu_start_gp() start up the next grace period + * if one is needed. Note that the caller must hold rnp->lock, as + * required by rcu_start_gp(), which will release it. + */ +static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) + __releases(rcu_get_root(rsp)->lock) +{ + WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); + raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); + wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ } /* @@ -1258,7 +1496,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, * based on quiescent states detected in an earlier grace period! */ static void -rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp) +rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; unsigned long mask; @@ -1266,7 +1504,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); - if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) { + if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || + rnp->completed == rnp->gpnum) { /* * The grace period in which this quiescent state was @@ -1325,7 +1564,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Tell RCU we are done (but rcu_report_qs_rdp() will be the * judge of that). */ - rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum); + rcu_report_qs_rdp(rdp->cpu, rsp, rdp); } #ifdef CONFIG_HOTPLUG_CPU @@ -1333,16 +1572,20 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) /* * Send the specified CPU's RCU callbacks to the orphanage. The * specified CPU must be offline, and the caller must hold the - * ->onofflock. + * ->orphan_lock. */ static void rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { + /* No-CBs CPUs do not have orphanable callbacks. */ + if (is_nocb_cpu(rdp->cpu)) + return; + /* * Orphan the callbacks. First adjust the counts. This is safe - * because ->onofflock excludes _rcu_barrier()'s adoption of - * the callbacks, thus no memory barrier is required. + * because _rcu_barrier() excludes CPU-hotplug operations, so it + * cannot be running now. Thus no memory barrier is required. */ if (rdp->nxtlist != NULL) { rsp->qlen_lazy += rdp->qlen_lazy; @@ -1383,22 +1626,15 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, /* * Adopt the RCU callbacks from the specified rcu_state structure's - * orphanage. The caller must hold the ->onofflock. + * orphanage. The caller must hold the ->orphan_lock. */ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) { int i; struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); - /* - * If there is an rcu_barrier() operation in progress, then - * only the task doing that operation is permitted to adopt - * callbacks. To do otherwise breaks rcu_barrier() and friends - * by causing them to fail to wait for the callbacks in the - * orphanage. - */ - if (rsp->rcu_barrier_in_progress && - rsp->rcu_barrier_in_progress != current) + /* No-CBs CPUs are handled specially. */ + if (rcu_nocb_adopt_orphan_cbs(rsp, rdp)) return; /* Do the accounting first. */ @@ -1455,9 +1691,8 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) * The CPU has been completely removed, and some other CPU is reporting * this fact from process context. Do the remainder of the cleanup, * including orphaning the outgoing CPU's RCU callbacks, and also - * adopting them, if there is no _rcu_barrier() instance running. - * There can only be one CPU hotplug operation at a time, so no other - * CPU can be attempting to update rcu_cpu_kthread_task. + * adopting them. There can only be one CPU hotplug operation at a time, + * so no other CPU can be attempting to update rcu_cpu_kthread_task. */ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { @@ -1468,13 +1703,13 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ /* Adjust any no-longer-needed kthreads. */ - rcu_stop_cpu_kthread(cpu); - rcu_node_kthread_setaffinity(rnp, -1); + rcu_boost_kthread_setaffinity(rnp, -1); /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ /* Exclude any attempts to start a new grace period. */ - raw_spin_lock_irqsave(&rsp->onofflock, flags); + mutex_lock(&rsp->onoff_mutex); + raw_spin_lock_irqsave(&rsp->orphan_lock, flags); /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); @@ -1501,10 +1736,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* * We still hold the leaf rcu_node structure lock here, and * irqs are still disabled. The reason for this subterfuge is - * because invoking rcu_report_unblock_qs_rnp() with ->onofflock + * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock * held leads to deadlock. */ - raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ + raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */ rnp = rdp->mynode; if (need_report & RCU_OFL_TASKS_NORM_GP) rcu_report_unblock_qs_rnp(rnp, flags); @@ -1515,14 +1750,14 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", cpu, rdp->qlen, rdp->nxtlist); + init_callback_list(rdp); + /* Disallow further callbacks on this CPU. */ + rdp->nxttail[RCU_NEXT_TAIL] = NULL; + mutex_unlock(&rsp->onoff_mutex); } #else /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) -{ -} - static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { } @@ -1541,7 +1776,8 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; struct rcu_head *next, *list, **tail; - int bl, count, count_lazy, i; + long bl, count, count_lazy; + int i; /* If no callbacks are ready, just return.*/ if (!cpu_has_callbacks_ready_to_invoke(rdp)) { @@ -1687,6 +1923,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) struct rcu_node *rnp; rcu_for_each_leaf_node(rsp, rnp) { + cond_resched(); mask = 0; raw_spin_lock_irqsave(&rnp->lock, flags); if (!rcu_gp_in_progress(rsp)) { @@ -1723,72 +1960,39 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) * Force quiescent states on reluctant CPUs, and also detect which * CPUs are in dyntick-idle mode. */ -static void force_quiescent_state(struct rcu_state *rsp, int relaxed) +static void force_quiescent_state(struct rcu_state *rsp) { unsigned long flags; - struct rcu_node *rnp = rcu_get_root(rsp); - - trace_rcu_utilization("Start fqs"); - if (!rcu_gp_in_progress(rsp)) { - trace_rcu_utilization("End fqs"); - return; /* No grace period in progress, nothing to force. */ - } - if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { - rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ - trace_rcu_utilization("End fqs"); - return; /* Someone else is already on the job. */ - } - if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) - goto unlock_fqs_ret; /* no emergency and done recently. */ - rsp->n_force_qs++; - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; - if(!rcu_gp_in_progress(rsp)) { - rsp->n_force_qs_ngp++; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ - goto unlock_fqs_ret; /* no GP in progress, time updated. */ - } - rsp->fqs_active = 1; - switch (rsp->fqs_state) { - case RCU_GP_IDLE: - case RCU_GP_INIT: - - break; /* grace period idle or initializing, ignore. */ - - case RCU_SAVE_DYNTICK: - - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ - - /* Record dyntick-idle state. */ - force_qs_rnp(rsp, dyntick_save_progress_counter); - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - if (rcu_gp_in_progress(rsp)) - rsp->fqs_state = RCU_FORCE_QS; - break; - - case RCU_FORCE_QS: - - /* Check dyntick-idle state, send IPI to laggarts. */ - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ - force_qs_rnp(rsp, rcu_implicit_dynticks_qs); - - /* Leave state in case more forcing is required. */ - - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - break; + bool ret; + struct rcu_node *rnp; + struct rcu_node *rnp_old = NULL; + + /* Funnel through hierarchy to reduce memory contention. */ + rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; + for (; rnp != NULL; rnp = rnp->parent) { + ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || + !raw_spin_trylock(&rnp->fqslock); + if (rnp_old != NULL) + raw_spin_unlock(&rnp_old->fqslock); + if (ret) { + rsp->n_force_qs_lh++; + return; + } + rnp_old = rnp; } - rsp->fqs_active = 0; - if (rsp->fqs_need_gp) { - raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ - rsp->fqs_need_gp = 0; - rcu_start_gp(rsp, flags); /* releases rnp->lock */ - trace_rcu_utilization("End fqs"); - return; + /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ + + /* Reached the root of the rcu_node tree, acquire lock. */ + raw_spin_lock_irqsave(&rnp_old->lock, flags); + raw_spin_unlock(&rnp_old->fqslock); + if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { + rsp->n_force_qs_lh++; + raw_spin_unlock_irqrestore(&rnp_old->lock, flags); + return; /* Someone beat us to it. */ } - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ -unlock_fqs_ret: - raw_spin_unlock_irqrestore(&rsp->fqslock, flags); - trace_rcu_utilization("End fqs"); + rsp->gp_flags |= RCU_GP_FLAG_FQS; + raw_spin_unlock_irqrestore(&rnp_old->lock, flags); + wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ } /* @@ -1805,13 +2009,6 @@ __rcu_process_callbacks(struct rcu_state *rsp) WARN_ON_ONCE(rdp->beenonline == 0); /* - * If an RCU GP has gone long enough, go check for dyntick - * idle CPUs and, if needed, send resched IPIs. - */ - if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) - force_quiescent_state(rsp, 1); - - /* * Advance callbacks in response to end of earlier grace * period that some other CPU ended. */ @@ -1838,6 +2035,8 @@ static void rcu_process_callbacks(struct softirq_action *unused) { struct rcu_state *rsp; + if (cpu_is_offline(smp_processor_id())) + return; trace_rcu_utilization("Start RCU core"); for_each_rcu_flavor(rsp) __rcu_process_callbacks(rsp); @@ -1909,17 +2108,22 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, rdp->blimit = LONG_MAX; if (rsp->n_force_qs == rdp->n_force_qs_snap && *rdp->nxttail[RCU_DONE_TAIL] != head) - force_quiescent_state(rsp, 0); + force_quiescent_state(rsp); rdp->n_force_qs_snap = rsp->n_force_qs; rdp->qlen_last_fqs_check = rdp->qlen; } - } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) - force_quiescent_state(rsp, 1); + } } +/* + * Helper function for call_rcu() and friends. The cpu argument will + * normally be -1, indicating "currently running CPU". It may specify + * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() + * is expected to specify a CPU. + */ static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), - struct rcu_state *rsp, bool lazy) + struct rcu_state *rsp, int cpu, bool lazy) { unsigned long flags; struct rcu_data *rdp; @@ -1929,8 +2133,6 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), head->func = func; head->next = NULL; - smp_mb(); /* Ensure RCU update seen before callback registry. */ - /* * Opportunistically note grace-period endings and beginnings. * Note that we might see a beginning right after we see an @@ -1941,6 +2143,17 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rdp = this_cpu_ptr(rsp->rda); /* Add the callback to our list. */ + if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) { + int offline; + + if (cpu != -1) + rdp = per_cpu_ptr(rsp->rda, cpu); + offline = !__call_rcu_nocb(rdp, head, lazy); + WARN_ON_ONCE(offline); + /* _call_rcu() is illegal on offline CPU; leak the callback. */ + local_irq_restore(flags); + return; + } ACCESS_ONCE(rdp->qlen)++; if (lazy) rdp->qlen_lazy++; @@ -1966,7 +2179,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), */ void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_sched_state, 0); + __call_rcu(head, func, &rcu_sched_state, -1, 0); } EXPORT_SYMBOL_GPL(call_rcu_sched); @@ -1975,7 +2188,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); */ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_bh_state, 0); + __call_rcu(head, func, &rcu_bh_state, -1, 0); } EXPORT_SYMBOL_GPL(call_rcu_bh); @@ -2011,10 +2224,28 @@ static inline int rcu_blocking_is_gp(void) * rcu_read_lock_sched(). * * This means that all preempt_disable code sequences, including NMI and - * hardware-interrupt handlers, in progress on entry will have completed - * before this primitive returns. However, this does not guarantee that - * softirq handlers will have completed, since in some kernels, these - * handlers can run in process context, and can block. + * non-threaded hardware-interrupt handlers, in progress on entry will + * have completed before this primitive returns. However, this does not + * guarantee that softirq handlers will have completed, since in some + * kernels, these handlers can run in process context, and can block. + * + * Note that this guarantee implies further memory-ordering guarantees. + * On systems with more than one CPU, when synchronize_sched() returns, + * each CPU is guaranteed to have executed a full memory barrier since the + * end of its last RCU-sched read-side critical section whose beginning + * preceded the call to synchronize_sched(). In addition, each CPU having + * an RCU read-side critical section that extends beyond the return from + * synchronize_sched() is guaranteed to have executed a full memory barrier + * after the beginning of synchronize_sched() and before the beginning of + * that RCU read-side critical section. Note that these guarantees include + * CPUs that are offline, idle, or executing in user mode, as well as CPUs + * that are executing in the kernel. + * + * Furthermore, if CPU A invoked synchronize_sched(), which returned + * to its caller on CPU B, then both CPU A and CPU B are guaranteed + * to have executed a full memory barrier during the execution of + * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but + * again only if the system has more than one CPU). * * This primitive provides the guarantees made by the (now removed) * synchronize_kernel() API. In contrast, synchronize_rcu() only @@ -2030,7 +2261,10 @@ void synchronize_sched(void) "Illegal synchronize_sched() in RCU-sched read-side critical section"); if (rcu_blocking_is_gp()) return; - wait_rcu_gp(call_rcu_sched); + if (rcu_expedited) + synchronize_sched_expedited(); + else + wait_rcu_gp(call_rcu_sched); } EXPORT_SYMBOL_GPL(synchronize_sched); @@ -2042,6 +2276,9 @@ EXPORT_SYMBOL_GPL(synchronize_sched); * read-side critical sections have completed. RCU read-side critical * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), * and may be nested. + * + * See the description of synchronize_sched() for more detailed information + * on memory ordering guarantees. */ void synchronize_rcu_bh(void) { @@ -2051,13 +2288,13 @@ void synchronize_rcu_bh(void) "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); if (rcu_blocking_is_gp()) return; - wait_rcu_gp(call_rcu_bh); + if (rcu_expedited) + synchronize_rcu_bh_expedited(); + else + wait_rcu_gp(call_rcu_bh); } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); -static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); -static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); - static int synchronize_sched_expedited_cpu_stop(void *data) { /* @@ -2114,10 +2351,32 @@ static int synchronize_sched_expedited_cpu_stop(void *data) */ void synchronize_sched_expedited(void) { - int firstsnap, s, snap, trycount = 0; + long firstsnap, s, snap; + int trycount = 0; + struct rcu_state *rsp = &rcu_sched_state; - /* Note that atomic_inc_return() implies full memory barrier. */ - firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); + /* + * If we are in danger of counter wrap, just do synchronize_sched(). + * By allowing sync_sched_expedited_started to advance no more than + * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring + * that more than 3.5 billion CPUs would be required to force a + * counter wrap on a 32-bit system. Quite a few more CPUs would of + * course be required on a 64-bit system. + */ + if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start), + (ulong)atomic_long_read(&rsp->expedited_done) + + ULONG_MAX / 8)) { + synchronize_sched(); + atomic_long_inc(&rsp->expedited_wrap); + return; + } + + /* + * Take a ticket. Note that atomic_inc_return() implies a + * full memory barrier. + */ + snap = atomic_long_inc_return(&rsp->expedited_start); + firstsnap = snap; get_online_cpus(); WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); @@ -2129,48 +2388,65 @@ void synchronize_sched_expedited(void) synchronize_sched_expedited_cpu_stop, NULL) == -EAGAIN) { put_online_cpus(); + atomic_long_inc(&rsp->expedited_tryfail); + + /* Check to see if someone else did our work for us. */ + s = atomic_long_read(&rsp->expedited_done); + if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { + /* ensure test happens before caller kfree */ + smp_mb__before_atomic_inc(); /* ^^^ */ + atomic_long_inc(&rsp->expedited_workdone1); + return; + } /* No joy, try again later. Or just synchronize_sched(). */ if (trycount++ < 10) { udelay(trycount * num_online_cpus()); } else { - synchronize_sched(); + wait_rcu_gp(call_rcu_sched); + atomic_long_inc(&rsp->expedited_normal); return; } - /* Check to see if someone else did our work for us. */ - s = atomic_read(&sync_sched_expedited_done); - if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { - smp_mb(); /* ensure test happens before caller kfree */ + /* Recheck to see if someone else did our work for us. */ + s = atomic_long_read(&rsp->expedited_done); + if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { + /* ensure test happens before caller kfree */ + smp_mb__before_atomic_inc(); /* ^^^ */ + atomic_long_inc(&rsp->expedited_workdone2); return; } /* * Refetching sync_sched_expedited_started allows later - * callers to piggyback on our grace period. We subtract - * 1 to get the same token that the last incrementer got. - * We retry after they started, so our grace period works - * for them, and they started after our first try, so their - * grace period works for us. + * callers to piggyback on our grace period. We retry + * after they started, so our grace period works for them, + * and they started after our first try, so their grace + * period works for us. */ get_online_cpus(); - snap = atomic_read(&sync_sched_expedited_started); + snap = atomic_long_read(&rsp->expedited_start); smp_mb(); /* ensure read is before try_stop_cpus(). */ } + atomic_long_inc(&rsp->expedited_stoppedcpus); /* * Everyone up to our most recent fetch is covered by our grace * period. Update the counter, but only if our work is still * relevant -- which it won't be if someone who started later - * than we did beat us to the punch. + * than we did already did their update. */ do { - s = atomic_read(&sync_sched_expedited_done); - if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { - smp_mb(); /* ensure test happens before caller kfree */ + atomic_long_inc(&rsp->expedited_done_tries); + s = atomic_long_read(&rsp->expedited_done); + if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { + /* ensure test happens before caller kfree */ + smp_mb__before_atomic_inc(); /* ^^^ */ + atomic_long_inc(&rsp->expedited_done_lost); break; } - } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); + } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s); + atomic_long_inc(&rsp->expedited_done_exit); put_online_cpus(); } @@ -2195,17 +2471,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && rdp->qs_pending && !rdp->passed_quiesce) { - - /* - * If force_quiescent_state() coming soon and this CPU - * needs a quiescent state, and this is either RCU-sched - * or RCU-bh, force a local reschedule. - */ rdp->n_rp_qs_pending++; - if (!rdp->preemptible && - ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, - jiffies)) - set_need_resched(); } else if (rdp->qs_pending && rdp->passed_quiesce) { rdp->n_rp_report_qs++; return 1; @@ -2235,13 +2501,6 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) return 1; } - /* Has an RCU GP gone long enough to send resched IPIs &c? */ - if (rcu_gp_in_progress(rsp) && - ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) { - rdp->n_rp_need_fqs++; - return 1; - } - /* nothing to do */ rdp->n_rp_need_nothing++; return 0; @@ -2326,13 +2585,10 @@ static void rcu_barrier_func(void *type) static void _rcu_barrier(struct rcu_state *rsp) { int cpu; - unsigned long flags; struct rcu_data *rdp; - struct rcu_data rd; unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done); unsigned long snap_done; - init_rcu_head_on_stack(&rd.barrier_head); _rcu_barrier_trace(rsp, "Begin", -1, snap); /* Take mutex to serialize concurrent rcu_barrier() requests. */ @@ -2372,70 +2628,38 @@ static void _rcu_barrier(struct rcu_state *rsp) /* * Initialize the count to one rather than to zero in order to * avoid a too-soon return to zero in case of a short grace period - * (or preemption of this task). Also flag this task as doing - * an rcu_barrier(). This will prevent anyone else from adopting - * orphaned callbacks, which could cause otherwise failure if a - * CPU went offline and quickly came back online. To see this, - * consider the following sequence of events: - * - * 1. We cause CPU 0 to post an rcu_barrier_callback() callback. - * 2. CPU 1 goes offline, orphaning its callbacks. - * 3. CPU 0 adopts CPU 1's orphaned callbacks. - * 4. CPU 1 comes back online. - * 5. We cause CPU 1 to post an rcu_barrier_callback() callback. - * 6. Both rcu_barrier_callback() callbacks are invoked, awakening - * us -- but before CPU 1's orphaned callbacks are invoked!!! + * (or preemption of this task). Exclude CPU-hotplug operations + * to ensure that no offline CPU has callbacks queued. */ init_completion(&rsp->barrier_completion); atomic_set(&rsp->barrier_cpu_count, 1); - raw_spin_lock_irqsave(&rsp->onofflock, flags); - rsp->rcu_barrier_in_progress = current; - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); + get_online_cpus(); /* - * Force every CPU with callbacks to register a new callback - * that will tell us when all the preceding callbacks have - * been invoked. If an offline CPU has callbacks, wait for - * it to either come back online or to finish orphaning those - * callbacks. + * Force each CPU with callbacks to register a new callback. + * When that callback is invoked, we will know that all of the + * corresponding CPU's preceding callbacks have been invoked. */ for_each_possible_cpu(cpu) { - preempt_disable(); + if (!cpu_online(cpu) && !is_nocb_cpu(cpu)) + continue; rdp = per_cpu_ptr(rsp->rda, cpu); - if (cpu_is_offline(cpu)) { - _rcu_barrier_trace(rsp, "Offline", cpu, + if (is_nocb_cpu(cpu)) { + _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, rsp->n_barrier_done); - preempt_enable(); - while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) - schedule_timeout_interruptible(1); + atomic_inc(&rsp->barrier_cpu_count); + __call_rcu(&rdp->barrier_head, rcu_barrier_callback, + rsp, cpu, 0); } else if (ACCESS_ONCE(rdp->qlen)) { _rcu_barrier_trace(rsp, "OnlineQ", cpu, rsp->n_barrier_done); smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); - preempt_enable(); } else { _rcu_barrier_trace(rsp, "OnlineNQ", cpu, rsp->n_barrier_done); - preempt_enable(); } } - - /* - * Now that all online CPUs have rcu_barrier_callback() callbacks - * posted, we can adopt all of the orphaned callbacks and place - * an rcu_barrier_callback() callback after them. When that is done, - * we are guaranteed to have an rcu_barrier_callback() callback - * following every callback that could possibly have been - * registered before _rcu_barrier() was called. - */ - raw_spin_lock_irqsave(&rsp->onofflock, flags); - rcu_adopt_orphan_cbs(rsp); - rsp->rcu_barrier_in_progress = NULL; - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); - atomic_inc(&rsp->barrier_cpu_count); - smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ - rd.rsp = rsp; - rsp->call(&rd.barrier_head, rcu_barrier_callback); + put_online_cpus(); /* * Now that we have an rcu_barrier_callback() callback on each @@ -2456,8 +2680,6 @@ static void _rcu_barrier(struct rcu_state *rsp) /* Other rcu_barrier() invocations can now safely proceed. */ mutex_unlock(&rsp->barrier_mutex); - - destroy_rcu_head_on_stack(&rd.barrier_head); } /** @@ -2497,8 +2719,12 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); +#ifdef CONFIG_RCU_USER_QS + WARN_ON_ONCE(rdp->dynticks->in_user); +#endif rdp->cpu = cpu; rdp->rsp = rsp; + rcu_boot_init_nocb_percpu_data(rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -2516,6 +2742,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); + /* Exclude new grace periods. */ + mutex_lock(&rsp->onoff_mutex); + /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); rdp->beenonline = 1; /* We have now been online. */ @@ -2523,20 +2752,13 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; + init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; atomic_set(&rdp->dynticks->dynticks, (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); rcu_prepare_for_idle_init(cpu); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - /* - * A new grace period might start here. If so, we won't be part - * of it, but that is OK, as we are currently in a quiescent state. - */ - - /* Exclude any attempts to start a new GP on large systems. */ - raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ - /* Add CPU to rcu_node bitmasks. */ rnp = rdp->mynode; mask = rdp->grpmask; @@ -2555,14 +2777,14 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->completed = rnp->completed; rdp->passed_quiesce = 0; rdp->qs_pending = 0; - rdp->passed_quiesce_gpnum = rnp->gpnum - 1; trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); } raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ rnp = rnp->parent; } while (rnp != NULL && !(rnp->qsmaskinit & mask)); + local_irq_restore(flags); - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); + mutex_unlock(&rsp->onoff_mutex); } static void __cpuinit rcu_prepare_cpu(int cpu) @@ -2584,6 +2806,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); struct rcu_node *rnp = rdp->mynode; struct rcu_state *rsp; + int ret = NOTIFY_OK; trace_rcu_utilization("Start CPU hotplug"); switch (action) { @@ -2594,12 +2817,13 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, break; case CPU_ONLINE: case CPU_DOWN_FAILED: - rcu_node_kthread_setaffinity(rnp, -1); - rcu_cpu_kthread_setrt(cpu, 1); + rcu_boost_kthread_setaffinity(rnp, -1); break; case CPU_DOWN_PREPARE: - rcu_node_kthread_setaffinity(rnp, cpu); - rcu_cpu_kthread_setrt(cpu, 0); + if (nocb_cpu_expendable(cpu)) + rcu_boost_kthread_setaffinity(rnp, cpu); + else + ret = NOTIFY_BAD; break; case CPU_DYING: case CPU_DYING_FROZEN: @@ -2623,8 +2847,31 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, break; } trace_rcu_utilization("End CPU hotplug"); - return NOTIFY_OK; + return ret; +} + +/* + * Spawn the kthread that handles this RCU flavor's grace periods. + */ +static int __init rcu_spawn_gp_kthread(void) +{ + unsigned long flags; + struct rcu_node *rnp; + struct rcu_state *rsp; + struct task_struct *t; + + for_each_rcu_flavor(rsp) { + t = kthread_run(rcu_gp_kthread, rsp, rsp->name); + BUG_ON(IS_ERR(t)); + rnp = rcu_get_root(rsp); + raw_spin_lock_irqsave(&rnp->lock, flags); + rsp->gp_kthread = t; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + rcu_spawn_nocb_kthreads(rsp); + } + return 0; } +early_initcall(rcu_spawn_gp_kthread); /* * This function is invoked towards the end of the scheduler's initialization @@ -2661,7 +2908,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) int cprv; int i; - cprv = NR_CPUS; + cprv = nr_cpu_ids; for (i = rcu_num_lvls - 1; i >= 0; i--) { ccur = rsp->levelcnt[i]; rsp->levelspread[i] = (cprv + ccur - 1) / ccur; @@ -2676,10 +2923,14 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) static void __init rcu_init_one(struct rcu_state *rsp, struct rcu_data __percpu *rda) { - static char *buf[] = { "rcu_node_level_0", - "rcu_node_level_1", - "rcu_node_level_2", - "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */ + static char *buf[] = { "rcu_node_0", + "rcu_node_1", + "rcu_node_2", + "rcu_node_3" }; /* Match MAX_RCU_LVLS */ + static char *fqs[] = { "rcu_node_fqs_0", + "rcu_node_fqs_1", + "rcu_node_fqs_2", + "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ int cpustride = 1; int i; int j; @@ -2704,7 +2955,11 @@ static void __init rcu_init_one(struct rcu_state *rsp, raw_spin_lock_init(&rnp->lock); lockdep_set_class_and_name(&rnp->lock, &rcu_node_class[i], buf[i]); - rnp->gpnum = 0; + raw_spin_lock_init(&rnp->fqslock); + lockdep_set_class_and_name(&rnp->fqslock, + &rcu_fqs_class[i], fqs[i]); + rnp->gpnum = rsp->gpnum; + rnp->completed = rsp->completed; rnp->qsmask = 0; rnp->qsmaskinit = 0; rnp->grplo = j * cpustride; @@ -2727,6 +2982,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, } rsp->rda = rda; + init_waitqueue_head(&rsp->gp_wq); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) @@ -2750,7 +3006,8 @@ static void __init rcu_init_geometry(void) int rcu_capacity[MAX_RCU_LVLS + 1]; /* If the compile-time values are accurate, just leave. */ - if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF) + if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && + nr_cpu_ids == NR_CPUS) return; /* @@ -2806,6 +3063,7 @@ void __init rcu_init(void) rcu_init_one(&rcu_sched_state, &rcu_sched_data); rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); + rcu_init_nocb(); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); /* diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 4d29169f2124..4b69291b093d 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -102,6 +102,10 @@ struct rcu_dynticks { /* idle-period nonlazy_posted snapshot. */ int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ +#ifdef CONFIG_RCU_USER_QS + bool ignore_user_qs; /* Treat userspace as extended QS or not */ + bool in_user; /* Is the CPU in userland from RCU POV? */ +#endif }; /* RCU's kthread states for tracing. */ @@ -196,12 +200,7 @@ struct rcu_node { /* Refused to boost: not sure why, though. */ /* This can happen due to race conditions. */ #endif /* #ifdef CONFIG_RCU_BOOST */ - struct task_struct *node_kthread_task; - /* kthread that takes care of this rcu_node */ - /* structure, for example, awakening the */ - /* per-CPU kthreads as needed. */ - unsigned int node_kthread_status; - /* State of node_kthread_task for tracing. */ + raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; } ____cacheline_internodealigned_in_smp; /* @@ -245,8 +244,6 @@ struct rcu_data { /* in order to detect GP end. */ unsigned long gpnum; /* Highest gp number that this CPU */ /* is aware of having started. */ - unsigned long passed_quiesce_gpnum; - /* gpnum at time of quiescent state. */ bool passed_quiesce; /* User-mode/idle loop etc. */ bool qs_pending; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ @@ -290,6 +287,7 @@ struct rcu_data { long qlen_last_fqs_check; /* qlen at last check for QS forcing */ unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ + unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */ unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ unsigned long n_force_qs_snap; @@ -312,11 +310,25 @@ struct rcu_data { unsigned long n_rp_cpu_needs_gp; unsigned long n_rp_gp_completed; unsigned long n_rp_gp_started; - unsigned long n_rp_need_fqs; unsigned long n_rp_need_nothing; - /* 6) _rcu_barrier() callback. */ + /* 6) _rcu_barrier() and OOM callbacks. */ struct rcu_head barrier_head; +#ifdef CONFIG_RCU_FAST_NO_HZ + struct rcu_head oom_head; +#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ + + /* 7) Callback offloading. */ +#ifdef CONFIG_RCU_NOCB_CPU + struct rcu_head *nocb_head; /* CBs waiting for kthread. */ + struct rcu_head **nocb_tail; + atomic_long_t nocb_q_count; /* # CBs waiting for kthread */ + atomic_long_t nocb_q_count_lazy; /* (approximate). */ + int nocb_p_count; /* # CBs being invoked by kthread */ + int nocb_p_count_lazy; /* (approximate). */ + wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ + struct task_struct *nocb_kthread; +#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ int cpu; struct rcu_state *rsp; @@ -370,26 +382,28 @@ struct rcu_state { struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ void (*func)(struct rcu_head *head)); +#ifdef CONFIG_RCU_NOCB_CPU + void (*call_remote)(struct rcu_head *head, + void (*func)(struct rcu_head *head)); + /* call_rcu() flavor, but for */ + /* placing on remote CPU. */ +#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ /* The following fields are guarded by the root rcu_node's lock. */ u8 fqs_state ____cacheline_internodealigned_in_smp; /* Force QS state. */ - u8 fqs_active; /* force_quiescent_state() */ - /* is running. */ - u8 fqs_need_gp; /* A CPU was prevented from */ - /* starting a new grace */ - /* period because */ - /* force_quiescent_state() */ - /* was running. */ u8 boost; /* Subject to priority boost. */ unsigned long gpnum; /* Current gp number. */ unsigned long completed; /* # of last completed gp. */ + struct task_struct *gp_kthread; /* Task for grace periods. */ + wait_queue_head_t gp_wq; /* Where GP task waits. */ + int gp_flags; /* Commands for GP task. */ /* End of fields guarded by root rcu_node's lock. */ - raw_spinlock_t onofflock; /* exclude on/offline and */ - /* starting new GP. */ + raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; + /* Protect following fields. */ struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ /* need a grace period. */ struct rcu_head **orphan_nxttail; /* Tail of above. */ @@ -398,16 +412,29 @@ struct rcu_state { struct rcu_head **orphan_donetail; /* Tail of above. */ long qlen_lazy; /* Number of lazy callbacks. */ long qlen; /* Total number of callbacks. */ - struct task_struct *rcu_barrier_in_progress; - /* Task doing rcu_barrier(), */ - /* or NULL if no barrier. */ + /* End of fields guarded by orphan_lock. */ + + struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */ + struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. */ unsigned long n_barrier_done; /* ++ at start and end of */ /* _rcu_barrier(). */ - raw_spinlock_t fqslock; /* Only one task forcing */ - /* quiescent states. */ + /* End of fields guarded by barrier_mutex. */ + + atomic_long_t expedited_start; /* Starting ticket. */ + atomic_long_t expedited_done; /* Done ticket. */ + atomic_long_t expedited_wrap; /* # near-wrap incidents. */ + atomic_long_t expedited_tryfail; /* # acquisition failures. */ + atomic_long_t expedited_workdone1; /* # done by others #1. */ + atomic_long_t expedited_workdone2; /* # done by others #2. */ + atomic_long_t expedited_normal; /* # fallbacks to normal. */ + atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */ + atomic_long_t expedited_done_tries; /* # tries to update _done. */ + atomic_long_t expedited_done_lost; /* # times beaten to _done. */ + atomic_long_t expedited_done_exit; /* # times exited _done loop. */ + unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ unsigned long n_force_qs; /* Number of calls to */ @@ -426,7 +453,13 @@ struct rcu_state { struct list_head flavors; /* List of RCU flavors. */ }; +/* Values for rcu_state structure's gp_flags field. */ +#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ +#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ + extern struct list_head rcu_struct_flavors; + +/* Sequence through rcu_state structures for each RCU flavor. */ #define for_each_rcu_flavor(rsp) \ list_for_each_entry((rsp), &rcu_struct_flavors, flavors) @@ -468,7 +501,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags); -static void rcu_stop_cpu_kthread(int cpu); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_print_detail_task_stall(struct rcu_state *rsp); static int rcu_print_task_stall(struct rcu_node *rnp); @@ -491,15 +523,9 @@ static void invoke_rcu_callbacks_kthread(void); static bool rcu_is_callbacks_kthread(void); #ifdef CONFIG_RCU_BOOST static void rcu_preempt_do_callbacks(void); -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, - cpumask_var_t cm); static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, - struct rcu_node *rnp, - int rnp_index); -static void invoke_rcu_node_kthread(struct rcu_node *rnp); -static void rcu_yield(void (*f)(unsigned long), unsigned long arg); + struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_BOOST */ -static void rcu_cpu_kthread_setrt(int cpu, int to_rt); static void __cpuinit rcu_prepare_kthreads(int cpu); static void rcu_prepare_for_idle_init(int cpu); static void rcu_cleanup_after_idle(int cpu); @@ -510,5 +536,32 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); +static bool is_nocb_cpu(int cpu); +static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, + bool lazy); +static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, + struct rcu_data *rdp); +static bool nocb_cpu_expendable(int cpu); +static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); +static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); +static void init_nocb_callback_list(struct rcu_data *rdp); +static void __init rcu_init_nocb(void); #endif /* #ifndef RCU_TREE_NONCORE */ + +#ifdef CONFIG_RCU_TRACE +#ifdef CONFIG_RCU_NOCB_CPU +/* Sum up queue lengths for tracing. */ +static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) +{ + *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count; + *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy; +} +#else /* #ifdef CONFIG_RCU_NOCB_CPU */ +static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) +{ + *ql = 0; + *qll = 0; +} +#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ +#endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 7f3244c0df01..f6e5ec2932b4 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -25,6 +25,9 @@ */ #include <linux/delay.h> +#include <linux/gfp.h> +#include <linux/oom.h> +#include <linux/smpboot.h> #define RCU_KTHREAD_PRIO 1 @@ -34,6 +37,14 @@ #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO #endif +#ifdef CONFIG_RCU_NOCB_CPU +static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ +static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */ +static bool rcu_nocb_poll; /* Offload kthread are to poll. */ +module_param(rcu_nocb_poll, bool, 0444); +static char __initdata nocb_buf[NR_CPUS * 5]; +#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ + /* * Check the RCU kernel configuration parameters and print informative * messages about anything out of the ordinary. If you like #ifdef, you @@ -74,6 +85,18 @@ static void __init rcu_bootup_announce_oddness(void) printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); +#ifdef CONFIG_RCU_NOCB_CPU + if (have_rcu_nocb_mask) { + if (cpumask_test_cpu(0, rcu_nocb_mask)) { + cpumask_clear_cpu(0, rcu_nocb_mask); + pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n"); + } + cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); + pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); + if (rcu_nocb_poll) + pr_info("\tExperimental polled no-CBs CPUs.\n"); + } +#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ } #ifdef CONFIG_TREE_PREEMPT_RCU @@ -118,7 +141,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed); */ void rcu_force_quiescent_state(void) { - force_quiescent_state(&rcu_preempt_state, 0); + force_quiescent_state(&rcu_preempt_state); } EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); @@ -136,8 +159,6 @@ static void rcu_preempt_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); - rdp->passed_quiesce_gpnum = rdp->gpnum; - barrier(); if (rdp->passed_quiesce == 0) trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); rdp->passed_quiesce = 1; @@ -422,9 +443,11 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) unsigned long flags; struct task_struct *t; - if (!rcu_preempt_blocked_readers_cgp(rnp)) - return; raw_spin_lock_irqsave(&rnp->lock, flags); + if (!rcu_preempt_blocked_readers_cgp(rnp)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } t = list_entry(rnp->gp_tasks, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) @@ -584,17 +607,23 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ } + rnp->gp_tasks = NULL; + rnp->exp_tasks = NULL; #ifdef CONFIG_RCU_BOOST - /* In case root is being boosted and leaf is not. */ + rnp->boost_tasks = NULL; + /* + * In case root is being boosted and leaf was not. Make sure + * that we boost the tasks blocking the current grace period + * in this case. + */ raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ if (rnp_root->boost_tasks != NULL && - rnp_root->boost_tasks != rnp_root->gp_tasks) + rnp_root->boost_tasks != rnp_root->gp_tasks && + rnp_root->boost_tasks != rnp_root->exp_tasks) rnp_root->boost_tasks = rnp_root->gp_tasks; raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ #endif /* #ifdef CONFIG_RCU_BOOST */ - rnp->gp_tasks = NULL; - rnp->exp_tasks = NULL; return retval; } @@ -634,7 +663,7 @@ static void rcu_preempt_do_callbacks(void) */ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_preempt_state, 0); + __call_rcu(head, func, &rcu_preempt_state, -1, 0); } EXPORT_SYMBOL_GPL(call_rcu); @@ -648,7 +677,7 @@ EXPORT_SYMBOL_GPL(call_rcu); void kfree_call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_preempt_state, 1); + __call_rcu(head, func, &rcu_preempt_state, -1, 1); } EXPORT_SYMBOL_GPL(kfree_call_rcu); @@ -662,6 +691,9 @@ EXPORT_SYMBOL_GPL(kfree_call_rcu); * concurrently with new RCU read-side critical sections that began while * synchronize_rcu() was waiting. RCU read-side critical sections are * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. + * + * See the description of synchronize_sched() for more detailed information + * on memory ordering guarantees. */ void synchronize_rcu(void) { @@ -671,12 +703,15 @@ void synchronize_rcu(void) "Illegal synchronize_rcu() in RCU read-side critical section"); if (!rcu_scheduler_active) return; - wait_rcu_gp(call_rcu); + if (rcu_expedited) + synchronize_rcu_expedited(); + else + wait_rcu_gp(call_rcu); } EXPORT_SYMBOL_GPL(synchronize_rcu); static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); -static long sync_rcu_preempt_exp_count; +static unsigned long sync_rcu_preempt_exp_count; static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); /* @@ -749,7 +784,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, * grace period for the specified rcu_node structure. If there are no such * tasks, report it up the rcu_node hierarchy. * - * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock. + * Caller must hold sync_rcu_preempt_exp_mutex and must exclude + * CPU hotplug operations. */ static void sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) @@ -791,7 +827,7 @@ void synchronize_rcu_expedited(void) unsigned long flags; struct rcu_node *rnp; struct rcu_state *rsp = &rcu_preempt_state; - long snap; + unsigned long snap; int trycount = 0; smp_mb(); /* Caller's modifications seen first by other CPUs. */ @@ -799,33 +835,47 @@ void synchronize_rcu_expedited(void) smp_mb(); /* Above access cannot bleed into critical section. */ /* + * Block CPU-hotplug operations. This means that any CPU-hotplug + * operation that finds an rcu_node structure with tasks in the + * process of being boosted will know that all tasks blocking + * this expedited grace period will already be in the process of + * being boosted. This simplifies the process of moving tasks + * from leaf to root rcu_node structures. + */ + get_online_cpus(); + + /* * Acquire lock, falling back to synchronize_rcu() if too many * lock-acquisition failures. Of course, if someone does the * expedited grace period for us, just leave. */ while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { + if (ULONG_CMP_LT(snap, + ACCESS_ONCE(sync_rcu_preempt_exp_count))) { + put_online_cpus(); + goto mb_ret; /* Others did our work for us. */ + } if (trycount++ < 10) { udelay(trycount * num_online_cpus()); } else { - synchronize_rcu(); + put_online_cpus(); + wait_rcu_gp(call_rcu); return; } - if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) - goto mb_ret; /* Others did our work for us. */ } - if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) + if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) { + put_online_cpus(); goto unlock_mb_ret; /* Others did our work for us. */ + } /* force all RCU readers onto ->blkd_tasks lists. */ synchronize_sched_expedited(); - raw_spin_lock_irqsave(&rsp->onofflock, flags); - /* Initialize ->expmask for all non-leaf rcu_node structures. */ rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock_irqsave(&rnp->lock, flags); rnp->expmask = rnp->qsmaskinit; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* Snapshot current state of ->blkd_tasks lists. */ @@ -834,7 +884,7 @@ void synchronize_rcu_expedited(void) if (NUM_RCU_NODES > 1) sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); + put_online_cpus(); /* Wait for snapshotted ->blkd_tasks lists to drain. */ rnp = rcu_get_root(rsp); @@ -853,6 +903,11 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); /** * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. + * + * Note that this primitive does not necessarily wait for an RCU grace period + * to complete. For example, if there are no RCU callbacks queued anywhere + * in the system, then rcu_barrier() is within its rights to return + * immediately, without waiting for anything, much less an RCU grace period. */ void rcu_barrier(void) { @@ -991,7 +1046,7 @@ static void rcu_preempt_check_callbacks(int cpu) void kfree_call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_sched_state, 1); + __call_rcu(head, func, &rcu_sched_state, -1, 1); } EXPORT_SYMBOL_GPL(kfree_call_rcu); @@ -1069,6 +1124,16 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp) #endif /* #else #ifdef CONFIG_RCU_TRACE */ +static void rcu_wake_cond(struct task_struct *t, int status) +{ + /* + * If the thread is yielding, only wake it when this + * is invoked from idle + */ + if (status != RCU_KTHREAD_YIELDING || is_idle_task(current)) + wake_up_process(t); +} + /* * Carry out RCU priority boosting on the task indicated by ->exp_tasks * or ->boost_tasks, advancing the pointer to the next task in the @@ -1141,17 +1206,6 @@ static int rcu_boost(struct rcu_node *rnp) } /* - * Timer handler to initiate waking up of boost kthreads that - * have yielded the CPU due to excessive numbers of tasks to - * boost. We wake up the per-rcu_node kthread, which in turn - * will wake up the booster kthread. - */ -static void rcu_boost_kthread_timer(unsigned long arg) -{ - invoke_rcu_node_kthread((struct rcu_node *)arg); -} - -/* * Priority-boosting kthread. One per leaf rcu_node and one for the * root rcu_node. */ @@ -1174,8 +1228,9 @@ static int rcu_boost_kthread(void *arg) else spincnt = 0; if (spincnt > 10) { + rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; trace_rcu_utilization("End boost kthread@rcu_yield"); - rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); + schedule_timeout_interruptible(2); trace_rcu_utilization("Start boost kthread@rcu_yield"); spincnt = 0; } @@ -1191,9 +1246,9 @@ static int rcu_boost_kthread(void *arg) * kthread to start boosting them. If there is an expedited grace * period in progress, it is always time to boost. * - * The caller must hold rnp->lock, which this function releases, - * but irqs remain disabled. The ->boost_kthread_task is immortal, - * so we don't need to worry about it going away. + * The caller must hold rnp->lock, which this function releases. + * The ->boost_kthread_task is immortal, so we don't need to worry + * about it going away. */ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) { @@ -1213,8 +1268,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) rnp->boost_tasks = rnp->gp_tasks; raw_spin_unlock_irqrestore(&rnp->lock, flags); t = rnp->boost_kthread_task; - if (t != NULL) - wake_up_process(t); + if (t) + rcu_wake_cond(t, rnp->boost_kthread_status); } else { rcu_initiate_boost_trace(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -1231,8 +1286,10 @@ static void invoke_rcu_callbacks_kthread(void) local_irq_save(flags); __this_cpu_write(rcu_cpu_has_work, 1); if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && - current != __this_cpu_read(rcu_cpu_kthread_task)) - wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); + current != __this_cpu_read(rcu_cpu_kthread_task)) { + rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task), + __this_cpu_read(rcu_cpu_kthread_status)); + } local_irq_restore(flags); } @@ -1245,21 +1302,6 @@ static bool rcu_is_callbacks_kthread(void) return __get_cpu_var(rcu_cpu_kthread_task) == current; } -/* - * Set the affinity of the boost kthread. The CPU-hotplug locks are - * held, so no one should be messing with the existence of the boost - * kthread. - */ -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, - cpumask_var_t cm) -{ - struct task_struct *t; - - t = rnp->boost_kthread_task; - if (t != NULL) - set_cpus_allowed_ptr(rnp->boost_kthread_task, cm); -} - #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) /* @@ -1276,15 +1318,19 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) * Returns zero if all is well, a negated errno otherwise. */ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, - struct rcu_node *rnp, - int rnp_index) + struct rcu_node *rnp) { + int rnp_index = rnp - &rsp->node[0]; unsigned long flags; struct sched_param sp; struct task_struct *t; if (&rcu_preempt_state != rsp) return 0; + + if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0) + return 0; + rsp->boost = 1; if (rnp->boost_kthread_task != NULL) return 0; @@ -1301,25 +1347,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, return 0; } -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Stop the RCU's per-CPU kthread when its CPU goes offline,. - */ -static void rcu_stop_cpu_kthread(int cpu) -{ - struct task_struct *t; - - /* Stop the CPU's kthread. */ - t = per_cpu(rcu_cpu_kthread_task, cpu); - if (t != NULL) { - per_cpu(rcu_cpu_kthread_task, cpu) = NULL; - kthread_stop(t); - } -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - static void rcu_kthread_do_work(void) { rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); @@ -1327,112 +1354,22 @@ static void rcu_kthread_do_work(void) rcu_preempt_do_callbacks(); } -/* - * Wake up the specified per-rcu_node-structure kthread. - * Because the per-rcu_node kthreads are immortal, we don't need - * to do anything to keep them alive. - */ -static void invoke_rcu_node_kthread(struct rcu_node *rnp) -{ - struct task_struct *t; - - t = rnp->node_kthread_task; - if (t != NULL) - wake_up_process(t); -} - -/* - * Set the specified CPU's kthread to run RT or not, as specified by - * the to_rt argument. The CPU-hotplug locks are held, so the task - * is not going away. - */ -static void rcu_cpu_kthread_setrt(int cpu, int to_rt) +static void rcu_cpu_kthread_setup(unsigned int cpu) { - int policy; struct sched_param sp; - struct task_struct *t; - - t = per_cpu(rcu_cpu_kthread_task, cpu); - if (t == NULL) - return; - if (to_rt) { - policy = SCHED_FIFO; - sp.sched_priority = RCU_KTHREAD_PRIO; - } else { - policy = SCHED_NORMAL; - sp.sched_priority = 0; - } - sched_setscheduler_nocheck(t, policy, &sp); -} -/* - * Timer handler to initiate the waking up of per-CPU kthreads that - * have yielded the CPU due to excess numbers of RCU callbacks. - * We wake up the per-rcu_node kthread, which in turn will wake up - * the booster kthread. - */ -static void rcu_cpu_kthread_timer(unsigned long arg) -{ - struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg); - struct rcu_node *rnp = rdp->mynode; - - atomic_or(rdp->grpmask, &rnp->wakemask); - invoke_rcu_node_kthread(rnp); + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); } -/* - * Drop to non-real-time priority and yield, but only after posting a - * timer that will cause us to regain our real-time priority if we - * remain preempted. Either way, we restore our real-time priority - * before returning. - */ -static void rcu_yield(void (*f)(unsigned long), unsigned long arg) +static void rcu_cpu_kthread_park(unsigned int cpu) { - struct sched_param sp; - struct timer_list yield_timer; - int prio = current->rt_priority; - - setup_timer_on_stack(&yield_timer, f, arg); - mod_timer(&yield_timer, jiffies + 2); - sp.sched_priority = 0; - sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); - set_user_nice(current, 19); - schedule(); - set_user_nice(current, 0); - sp.sched_priority = prio; - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); - del_timer(&yield_timer); + per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; } -/* - * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU. - * This can happen while the corresponding CPU is either coming online - * or going offline. We cannot wait until the CPU is fully online - * before starting the kthread, because the various notifier functions - * can wait for RCU grace periods. So we park rcu_cpu_kthread() until - * the corresponding CPU is online. - * - * Return 1 if the kthread needs to stop, 0 otherwise. - * - * Caller must disable bh. This function can momentarily enable it. - */ -static int rcu_cpu_kthread_should_stop(int cpu) +static int rcu_cpu_kthread_should_run(unsigned int cpu) { - while (cpu_is_offline(cpu) || - !cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu)) || - smp_processor_id() != cpu) { - if (kthread_should_stop()) - return 1; - per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; - per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id(); - local_bh_enable(); - schedule_timeout_uninterruptible(1); - if (!cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu))) - set_cpus_allowed_ptr(current, cpumask_of(cpu)); - local_bh_disable(); - } - per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; - return 0; + return __get_cpu_var(rcu_cpu_has_work); } /* @@ -1440,138 +1377,35 @@ static int rcu_cpu_kthread_should_stop(int cpu) * RCU softirq used in flavors and configurations of RCU that do not * support RCU priority boosting. */ -static int rcu_cpu_kthread(void *arg) +static void rcu_cpu_kthread(unsigned int cpu) { - int cpu = (int)(long)arg; - unsigned long flags; - int spincnt = 0; - unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu); - char work; - char *workp = &per_cpu(rcu_cpu_has_work, cpu); + unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status); + char work, *workp = &__get_cpu_var(rcu_cpu_has_work); + int spincnt; - trace_rcu_utilization("Start CPU kthread@init"); - for (;;) { - *statusp = RCU_KTHREAD_WAITING; - trace_rcu_utilization("End CPU kthread@rcu_wait"); - rcu_wait(*workp != 0 || kthread_should_stop()); + for (spincnt = 0; spincnt < 10; spincnt++) { trace_rcu_utilization("Start CPU kthread@rcu_wait"); local_bh_disable(); - if (rcu_cpu_kthread_should_stop(cpu)) { - local_bh_enable(); - break; - } *statusp = RCU_KTHREAD_RUNNING; - per_cpu(rcu_cpu_kthread_loops, cpu)++; - local_irq_save(flags); + this_cpu_inc(rcu_cpu_kthread_loops); + local_irq_disable(); work = *workp; *workp = 0; - local_irq_restore(flags); + local_irq_enable(); if (work) rcu_kthread_do_work(); local_bh_enable(); - if (*workp != 0) - spincnt++; - else - spincnt = 0; - if (spincnt > 10) { - *statusp = RCU_KTHREAD_YIELDING; - trace_rcu_utilization("End CPU kthread@rcu_yield"); - rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); - trace_rcu_utilization("Start CPU kthread@rcu_yield"); - spincnt = 0; - } - } - *statusp = RCU_KTHREAD_STOPPED; - trace_rcu_utilization("End CPU kthread@term"); - return 0; -} - -/* - * Spawn a per-CPU kthread, setting up affinity and priority. - * Because the CPU hotplug lock is held, no other CPU will be attempting - * to manipulate rcu_cpu_kthread_task. There might be another CPU - * attempting to access it during boot, but the locking in kthread_bind() - * will enforce sufficient ordering. - * - * Please note that we cannot simply refuse to wake up the per-CPU - * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state, - * which can result in softlockup complaints if the task ends up being - * idle for more than a couple of minutes. - * - * However, please note also that we cannot bind the per-CPU kthread to its - * CPU until that CPU is fully online. We also cannot wait until the - * CPU is fully online before we create its per-CPU kthread, as this would - * deadlock the system when CPU notifiers tried waiting for grace - * periods. So we bind the per-CPU kthread to its CPU only if the CPU - * is online. If its CPU is not yet fully online, then the code in - * rcu_cpu_kthread() will wait until it is fully online, and then do - * the binding. - */ -static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) -{ - struct sched_param sp; - struct task_struct *t; - - if (!rcu_scheduler_fully_active || - per_cpu(rcu_cpu_kthread_task, cpu) != NULL) - return 0; - t = kthread_create_on_node(rcu_cpu_kthread, - (void *)(long)cpu, - cpu_to_node(cpu), - "rcuc/%d", cpu); - if (IS_ERR(t)) - return PTR_ERR(t); - if (cpu_online(cpu)) - kthread_bind(t, cpu); - per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; - WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL); - sp.sched_priority = RCU_KTHREAD_PRIO; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - per_cpu(rcu_cpu_kthread_task, cpu) = t; - wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */ - return 0; -} - -/* - * Per-rcu_node kthread, which is in charge of waking up the per-CPU - * kthreads when needed. We ignore requests to wake up kthreads - * for offline CPUs, which is OK because force_quiescent_state() - * takes care of this case. - */ -static int rcu_node_kthread(void *arg) -{ - int cpu; - unsigned long flags; - unsigned long mask; - struct rcu_node *rnp = (struct rcu_node *)arg; - struct sched_param sp; - struct task_struct *t; - - for (;;) { - rnp->node_kthread_status = RCU_KTHREAD_WAITING; - rcu_wait(atomic_read(&rnp->wakemask) != 0); - rnp->node_kthread_status = RCU_KTHREAD_RUNNING; - raw_spin_lock_irqsave(&rnp->lock, flags); - mask = atomic_xchg(&rnp->wakemask, 0); - rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) { - if ((mask & 0x1) == 0) - continue; - preempt_disable(); - t = per_cpu(rcu_cpu_kthread_task, cpu); - if (!cpu_online(cpu) || t == NULL) { - preempt_enable(); - continue; - } - per_cpu(rcu_cpu_has_work, cpu) = 1; - sp.sched_priority = RCU_KTHREAD_PRIO; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - preempt_enable(); + if (*workp == 0) { + trace_rcu_utilization("End CPU kthread@rcu_wait"); + *statusp = RCU_KTHREAD_WAITING; + return; } } - /* NOTREACHED */ - rnp->node_kthread_status = RCU_KTHREAD_STOPPED; - return 0; + *statusp = RCU_KTHREAD_YIELDING; + trace_rcu_utilization("Start CPU kthread@rcu_yield"); + schedule_timeout_interruptible(2); + trace_rcu_utilization("End CPU kthread@rcu_yield"); + *statusp = RCU_KTHREAD_WAITING; } /* @@ -1583,17 +1417,17 @@ static int rcu_node_kthread(void *arg) * no outgoing CPU. If there are no CPUs left in the affinity set, * this function allows the kthread to execute on any CPU. */ -static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { + struct task_struct *t = rnp->boost_kthread_task; + unsigned long mask = rnp->qsmaskinit; cpumask_var_t cm; int cpu; - unsigned long mask = rnp->qsmaskinit; - if (rnp->node_kthread_task == NULL) + if (!t) return; - if (!alloc_cpumask_var(&cm, GFP_KERNEL)) + if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) return; - cpumask_clear(cm); for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) if ((mask & 0x1) && cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); @@ -1603,62 +1437,36 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) cpumask_clear_cpu(cpu, cm); WARN_ON_ONCE(cpumask_weight(cm) == 0); } - set_cpus_allowed_ptr(rnp->node_kthread_task, cm); - rcu_boost_kthread_setaffinity(rnp, cm); + set_cpus_allowed_ptr(t, cm); free_cpumask_var(cm); } -/* - * Spawn a per-rcu_node kthread, setting priority and affinity. - * Called during boot before online/offline can happen, or, if - * during runtime, with the main CPU-hotplug locks held. So only - * one of these can be executing at a time. - */ -static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, - struct rcu_node *rnp) -{ - unsigned long flags; - int rnp_index = rnp - &rsp->node[0]; - struct sched_param sp; - struct task_struct *t; - - if (!rcu_scheduler_fully_active || - rnp->qsmaskinit == 0) - return 0; - if (rnp->node_kthread_task == NULL) { - t = kthread_create(rcu_node_kthread, (void *)rnp, - "rcun/%d", rnp_index); - if (IS_ERR(t)) - return PTR_ERR(t); - raw_spin_lock_irqsave(&rnp->lock, flags); - rnp->node_kthread_task = t; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - sp.sched_priority = 99; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ - } - return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index); -} +static struct smp_hotplug_thread rcu_cpu_thread_spec = { + .store = &rcu_cpu_kthread_task, + .thread_should_run = rcu_cpu_kthread_should_run, + .thread_fn = rcu_cpu_kthread, + .thread_comm = "rcuc/%u", + .setup = rcu_cpu_kthread_setup, + .park = rcu_cpu_kthread_park, +}; /* * Spawn all kthreads -- called as soon as the scheduler is running. */ static int __init rcu_spawn_kthreads(void) { - int cpu; struct rcu_node *rnp; + int cpu; rcu_scheduler_fully_active = 1; - for_each_possible_cpu(cpu) { + for_each_possible_cpu(cpu) per_cpu(rcu_cpu_has_work, cpu) = 0; - if (cpu_online(cpu)) - (void)rcu_spawn_one_cpu_kthread(cpu); - } + BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); rnp = rcu_get_root(rcu_state); - (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); if (NUM_RCU_NODES > 1) { rcu_for_each_leaf_node(rcu_state, rnp) - (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); } return 0; } @@ -1670,11 +1478,8 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) struct rcu_node *rnp = rdp->mynode; /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ - if (rcu_scheduler_fully_active) { - (void)rcu_spawn_one_cpu_kthread(cpu); - if (rnp->node_kthread_task == NULL) - (void)rcu_spawn_one_node_kthread(rcu_state, rnp); - } + if (rcu_scheduler_fully_active) + (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); } #else /* #ifdef CONFIG_RCU_BOOST */ @@ -1698,19 +1503,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) { } -#ifdef CONFIG_HOTPLUG_CPU - -static void rcu_stop_cpu_kthread(int cpu) -{ -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - -static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) -{ -} - -static void rcu_cpu_kthread_setrt(int cpu, int to_rt) +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { } @@ -1997,6 +1790,26 @@ static void rcu_prepare_for_idle(int cpu) if (!tne) return; + /* Adaptive-tick mode, where usermode execution is idle to RCU. */ + if (!is_idle_task(current)) { + rdtp->dyntick_holdoff = jiffies - 1; + if (rcu_cpu_has_nonlazy_callbacks(cpu)) { + trace_rcu_prep_idle("User dyntick with callbacks"); + rdtp->idle_gp_timer_expires = + round_up(jiffies + RCU_IDLE_GP_DELAY, + RCU_IDLE_GP_DELAY); + } else if (rcu_cpu_has_callbacks(cpu)) { + rdtp->idle_gp_timer_expires = + round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY); + trace_rcu_prep_idle("User dyntick with lazy callbacks"); + } else { + return; + } + tp = &rdtp->idle_gp_timer; + mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); + return; + } + /* * If this is an idle re-entry, for example, due to use of * RCU_NONIDLE() or the new idle-loop tracing API within the idle @@ -2075,16 +1888,16 @@ static void rcu_prepare_for_idle(int cpu) #ifdef CONFIG_TREE_PREEMPT_RCU if (per_cpu(rcu_preempt_data, cpu).nxtlist) { rcu_preempt_qs(cpu); - force_quiescent_state(&rcu_preempt_state, 0); + force_quiescent_state(&rcu_preempt_state); } #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ if (per_cpu(rcu_sched_data, cpu).nxtlist) { rcu_sched_qs(cpu); - force_quiescent_state(&rcu_sched_state, 0); + force_quiescent_state(&rcu_sched_state); } if (per_cpu(rcu_bh_data, cpu).nxtlist) { rcu_bh_qs(cpu); - force_quiescent_state(&rcu_bh_state, 0); + force_quiescent_state(&rcu_bh_state); } /* @@ -2112,6 +1925,88 @@ static void rcu_idle_count_callbacks_posted(void) __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); } +/* + * Data for flushing lazy RCU callbacks at OOM time. + */ +static atomic_t oom_callback_count; +static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq); + +/* + * RCU OOM callback -- decrement the outstanding count and deliver the + * wake-up if we are the last one. + */ +static void rcu_oom_callback(struct rcu_head *rhp) +{ + if (atomic_dec_and_test(&oom_callback_count)) + wake_up(&oom_callback_wq); +} + +/* + * Post an rcu_oom_notify callback on the current CPU if it has at + * least one lazy callback. This will unnecessarily post callbacks + * to CPUs that already have a non-lazy callback at the end of their + * callback list, but this is an infrequent operation, so accept some + * extra overhead to keep things simple. + */ +static void rcu_oom_notify_cpu(void *unused) +{ + struct rcu_state *rsp; + struct rcu_data *rdp; + + for_each_rcu_flavor(rsp) { + rdp = __this_cpu_ptr(rsp->rda); + if (rdp->qlen_lazy != 0) { + atomic_inc(&oom_callback_count); + rsp->call(&rdp->oom_head, rcu_oom_callback); + } + } +} + +/* + * If low on memory, ensure that each CPU has a non-lazy callback. + * This will wake up CPUs that have only lazy callbacks, in turn + * ensuring that they free up the corresponding memory in a timely manner. + * Because an uncertain amount of memory will be freed in some uncertain + * timeframe, we do not claim to have freed anything. + */ +static int rcu_oom_notify(struct notifier_block *self, + unsigned long notused, void *nfreed) +{ + int cpu; + + /* Wait for callbacks from earlier instance to complete. */ + wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0); + + /* + * Prevent premature wakeup: ensure that all increments happen + * before there is a chance of the counter reaching zero. + */ + atomic_set(&oom_callback_count, 1); + + get_online_cpus(); + for_each_online_cpu(cpu) { + smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); + cond_resched(); + } + put_online_cpus(); + + /* Unconditionally decrement: no need to wake ourselves up. */ + atomic_dec(&oom_callback_count); + + return NOTIFY_OK; +} + +static struct notifier_block rcu_oom_nb = { + .notifier_call = rcu_oom_notify +}; + +static int __init rcu_register_oom_notifier(void) +{ + register_oom_notifier(&rcu_oom_nb); + return 0; +} +early_initcall(rcu_register_oom_notifier); + #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ #ifdef CONFIG_RCU_CPU_STALL_INFO @@ -2122,11 +2017,15 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); struct timer_list *tltp = &rdtp->idle_gp_timer; + char c; - sprintf(cp, "drain=%d %c timer=%lu", - rdtp->dyntick_drain, - rdtp->dyntick_holdoff == jiffies ? 'H' : '.', - timer_pending(tltp) ? tltp->expires - jiffies : -1); + c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.'; + if (timer_pending(tltp)) + sprintf(cp, "drain=%d %c timer=%lu", + rdtp->dyntick_drain, c, tltp->expires - jiffies); + else + sprintf(cp, "drain=%d %c timer not pending", + rdtp->dyntick_drain, c); } #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ @@ -2194,11 +2093,10 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp) /* Increment ->ticks_this_gp for all flavors of RCU. */ static void increment_cpu_stall_ticks(void) { - __get_cpu_var(rcu_sched_data).ticks_this_gp++; - __get_cpu_var(rcu_bh_data).ticks_this_gp++; -#ifdef CONFIG_TREE_PREEMPT_RCU - __get_cpu_var(rcu_preempt_data).ticks_this_gp++; -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + __this_cpu_ptr(rsp->rda)->ticks_this_gp++; } #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ @@ -2227,3 +2125,373 @@ static void increment_cpu_stall_ticks(void) } #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ + +#ifdef CONFIG_RCU_NOCB_CPU + +/* + * Offload callback processing from the boot-time-specified set of CPUs + * specified by rcu_nocb_mask. For each CPU in the set, there is a + * kthread created that pulls the callbacks from the corresponding CPU, + * waits for a grace period to elapse, and invokes the callbacks. + * The no-CBs CPUs do a wake_up() on their kthread when they insert + * a callback into any empty list, unless the rcu_nocb_poll boot parameter + * has been specified, in which case each kthread actively polls its + * CPU. (Which isn't so great for energy efficiency, but which does + * reduce RCU's overhead on that CPU.) + * + * This is intended to be used in conjunction with Frederic Weisbecker's + * adaptive-idle work, which would seriously reduce OS jitter on CPUs + * running CPU-bound user-mode computations. + * + * Offloading of callback processing could also in theory be used as + * an energy-efficiency measure because CPUs with no RCU callbacks + * queued are more aggressive about entering dyntick-idle mode. + */ + + +/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */ +static int __init rcu_nocb_setup(char *str) +{ + alloc_bootmem_cpumask_var(&rcu_nocb_mask); + have_rcu_nocb_mask = true; + cpulist_parse(str, rcu_nocb_mask); + return 1; +} +__setup("rcu_nocbs=", rcu_nocb_setup); + +/* Is the specified CPU a no-CPUs CPU? */ +static bool is_nocb_cpu(int cpu) +{ + if (have_rcu_nocb_mask) + return cpumask_test_cpu(cpu, rcu_nocb_mask); + return false; +} + +/* + * Enqueue the specified string of rcu_head structures onto the specified + * CPU's no-CBs lists. The CPU is specified by rdp, the head of the + * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy + * counts are supplied by rhcount and rhcount_lazy. + * + * If warranted, also wake up the kthread servicing this CPUs queues. + */ +static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, + struct rcu_head *rhp, + struct rcu_head **rhtp, + int rhcount, int rhcount_lazy) +{ + int len; + struct rcu_head **old_rhpp; + struct task_struct *t; + + /* Enqueue the callback on the nocb list and update counts. */ + old_rhpp = xchg(&rdp->nocb_tail, rhtp); + ACCESS_ONCE(*old_rhpp) = rhp; + atomic_long_add(rhcount, &rdp->nocb_q_count); + atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); + + /* If we are not being polled and there is a kthread, awaken it ... */ + t = ACCESS_ONCE(rdp->nocb_kthread); + if (rcu_nocb_poll | !t) + return; + len = atomic_long_read(&rdp->nocb_q_count); + if (old_rhpp == &rdp->nocb_head) { + wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ + rdp->qlen_last_fqs_check = 0; + } else if (len > rdp->qlen_last_fqs_check + qhimark) { + wake_up_process(t); /* ... or if many callbacks queued. */ + rdp->qlen_last_fqs_check = LONG_MAX / 2; + } + return; +} + +/* + * This is a helper for __call_rcu(), which invokes this when the normal + * callback queue is inoperable. If this is not a no-CBs CPU, this + * function returns failure back to __call_rcu(), which can complain + * appropriately. + * + * Otherwise, this function queues the callback where the corresponding + * "rcuo" kthread can find it. + */ +static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, + bool lazy) +{ + + if (!is_nocb_cpu(rdp->cpu)) + return 0; + __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); + return 1; +} + +/* + * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is + * not a no-CBs CPU. + */ +static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, + struct rcu_data *rdp) +{ + long ql = rsp->qlen; + long qll = rsp->qlen_lazy; + + /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ + if (!is_nocb_cpu(smp_processor_id())) + return 0; + rsp->qlen = 0; + rsp->qlen_lazy = 0; + + /* First, enqueue the donelist, if any. This preserves CB ordering. */ + if (rsp->orphan_donelist != NULL) { + __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, + rsp->orphan_donetail, ql, qll); + ql = qll = 0; + rsp->orphan_donelist = NULL; + rsp->orphan_donetail = &rsp->orphan_donelist; + } + if (rsp->orphan_nxtlist != NULL) { + __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, + rsp->orphan_nxttail, ql, qll); + ql = qll = 0; + rsp->orphan_nxtlist = NULL; + rsp->orphan_nxttail = &rsp->orphan_nxtlist; + } + return 1; +} + +/* + * There must be at least one non-no-CBs CPU in operation at any given + * time, because no-CBs CPUs are not capable of initiating grace periods + * independently. This function therefore complains if the specified + * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to + * avoid offlining the last such CPU. (Recursion is a wonderful thing, + * but you have to have a base case!) + */ +static bool nocb_cpu_expendable(int cpu) +{ + cpumask_var_t non_nocb_cpus; + int ret; + + /* + * If there are no no-CB CPUs or if this CPU is not a no-CB CPU, + * then offlining this CPU is harmless. Let it happen. + */ + if (!have_rcu_nocb_mask || is_nocb_cpu(cpu)) + return 1; + + /* If no memory, play it safe and keep the CPU around. */ + if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO)) + return 0; + cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask); + cpumask_clear_cpu(cpu, non_nocb_cpus); + ret = !cpumask_empty(non_nocb_cpus); + free_cpumask_var(non_nocb_cpus); + return ret; +} + +/* + * Helper structure for remote registry of RCU callbacks. + * This is needed for when a no-CBs CPU needs to start a grace period. + * If it just invokes call_rcu(), the resulting callback will be queued, + * which can result in deadlock. + */ +struct rcu_head_remote { + struct rcu_head *rhp; + call_rcu_func_t *crf; + void (*func)(struct rcu_head *rhp); +}; + +/* + * Register a callback as specified by the rcu_head_remote struct. + * This function is intended to be invoked via smp_call_function_single(). + */ +static void call_rcu_local(void *arg) +{ + struct rcu_head_remote *rhrp = + container_of(arg, struct rcu_head_remote, rhp); + + rhrp->crf(rhrp->rhp, rhrp->func); +} + +/* + * Set up an rcu_head_remote structure and the invoke call_rcu_local() + * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via + * smp_call_function_single(). + */ +static void invoke_crf_remote(struct rcu_head *rhp, + void (*func)(struct rcu_head *rhp), + call_rcu_func_t crf) +{ + struct rcu_head_remote rhr; + + rhr.rhp = rhp; + rhr.crf = crf; + rhr.func = func; + smp_call_function_single(0, call_rcu_local, &rhr, 1); +} + +/* + * Helper functions to be passed to wait_rcu_gp(), each of which + * invokes invoke_crf_remote() to register a callback appropriately. + */ +static void __maybe_unused +call_rcu_preempt_remote(struct rcu_head *rhp, + void (*func)(struct rcu_head *rhp)) +{ + invoke_crf_remote(rhp, func, call_rcu); +} +static void call_rcu_bh_remote(struct rcu_head *rhp, + void (*func)(struct rcu_head *rhp)) +{ + invoke_crf_remote(rhp, func, call_rcu_bh); +} +static void call_rcu_sched_remote(struct rcu_head *rhp, + void (*func)(struct rcu_head *rhp)) +{ + invoke_crf_remote(rhp, func, call_rcu_sched); +} + +/* + * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes + * callbacks queued by the corresponding no-CBs CPU. + */ +static int rcu_nocb_kthread(void *arg) +{ + int c, cl; + struct rcu_head *list; + struct rcu_head *next; + struct rcu_head **tail; + struct rcu_data *rdp = arg; + + /* Each pass through this loop invokes one batch of callbacks */ + for (;;) { + /* If not polling, wait for next batch of callbacks. */ + if (!rcu_nocb_poll) + wait_event(rdp->nocb_wq, rdp->nocb_head); + list = ACCESS_ONCE(rdp->nocb_head); + if (!list) { + schedule_timeout_interruptible(1); + continue; + } + + /* + * Extract queued callbacks, update counts, and wait + * for a grace period to elapse. + */ + ACCESS_ONCE(rdp->nocb_head) = NULL; + tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); + c = atomic_long_xchg(&rdp->nocb_q_count, 0); + cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); + ACCESS_ONCE(rdp->nocb_p_count) += c; + ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; + wait_rcu_gp(rdp->rsp->call_remote); + + /* Each pass through the following loop invokes a callback. */ + trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); + c = cl = 0; + while (list) { + next = list->next; + /* Wait for enqueuing to complete, if needed. */ + while (next == NULL && &list->next != tail) { + schedule_timeout_interruptible(1); + next = list->next; + } + debug_rcu_head_unqueue(list); + local_bh_disable(); + if (__rcu_reclaim(rdp->rsp->name, list)) + cl++; + c++; + local_bh_enable(); + list = next; + } + trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); + ACCESS_ONCE(rdp->nocb_p_count) -= c; + ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl; + rdp->n_nocbs_invoked += c; + } + return 0; +} + +/* Initialize per-rcu_data variables for no-CBs CPUs. */ +static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) +{ + rdp->nocb_tail = &rdp->nocb_head; + init_waitqueue_head(&rdp->nocb_wq); +} + +/* Create a kthread for each RCU flavor for each no-CBs CPU. */ +static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) +{ + int cpu; + struct rcu_data *rdp; + struct task_struct *t; + + if (rcu_nocb_mask == NULL) + return; + for_each_cpu(cpu, rcu_nocb_mask) { + rdp = per_cpu_ptr(rsp->rda, cpu); + t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu); + BUG_ON(IS_ERR(t)); + ACCESS_ONCE(rdp->nocb_kthread) = t; + } +} + +/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ +static void init_nocb_callback_list(struct rcu_data *rdp) +{ + if (rcu_nocb_mask == NULL || + !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) + return; + rdp->nxttail[RCU_NEXT_TAIL] = NULL; +} + +/* Initialize the ->call_remote fields in the rcu_state structures. */ +static void __init rcu_init_nocb(void) +{ +#ifdef CONFIG_PREEMPT_RCU + rcu_preempt_state.call_remote = call_rcu_preempt_remote; +#endif /* #ifdef CONFIG_PREEMPT_RCU */ + rcu_bh_state.call_remote = call_rcu_bh_remote; + rcu_sched_state.call_remote = call_rcu_sched_remote; +} + +#else /* #ifdef CONFIG_RCU_NOCB_CPU */ + +static bool is_nocb_cpu(int cpu) +{ + return false; +} + +static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, + bool lazy) +{ + return 0; +} + +static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, + struct rcu_data *rdp) +{ + return 0; +} + +static bool nocb_cpu_expendable(int cpu) +{ + return 1; +} + +static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) +{ +} + +static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) +{ +} + +static void init_nocb_callback_list(struct rcu_data *rdp) +{ +} + +static void __init rcu_init_nocb(void) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index abffb486e94e..0d095dcaa670 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -46,29 +46,58 @@ #define RCU_TREE_NONCORE #include "rcutree.h" -static int show_rcubarrier(struct seq_file *m, void *unused) +#define ulong2long(a) (*(long *)(&(a))) + +static int r_open(struct inode *inode, struct file *file, + const struct seq_operations *op) { - struct rcu_state *rsp; + int ret = seq_open(file, op); + if (!ret) { + struct seq_file *m = (struct seq_file *)file->private_data; + m->private = inode->i_private; + } + return ret; +} + +static void *r_start(struct seq_file *m, loff_t *pos) +{ + struct rcu_state *rsp = (struct rcu_state *)m->private; + *pos = cpumask_next(*pos - 1, cpu_possible_mask); + if ((*pos) < nr_cpu_ids) + return per_cpu_ptr(rsp->rda, *pos); + return NULL; +} - for_each_rcu_flavor(rsp) - seq_printf(m, "%s: %c bcc: %d nbd: %lu\n", - rsp->name, rsp->rcu_barrier_in_progress ? 'B' : '.', - atomic_read(&rsp->barrier_cpu_count), - rsp->n_barrier_done); +static void *r_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return r_start(m, pos); +} + +static void r_stop(struct seq_file *m, void *v) +{ +} + +static int show_rcubarrier(struct seq_file *m, void *v) +{ + struct rcu_state *rsp = (struct rcu_state *)m->private; + seq_printf(m, "bcc: %d nbd: %lu\n", + atomic_read(&rsp->barrier_cpu_count), + rsp->n_barrier_done); return 0; } static int rcubarrier_open(struct inode *inode, struct file *file) { - return single_open(file, show_rcubarrier, NULL); + return single_open(file, show_rcubarrier, inode->i_private); } static const struct file_operations rcubarrier_fops = { .owner = THIS_MODULE, .open = rcubarrier_open, .read = seq_read, - .llseek = seq_lseek, - .release = single_release, + .llseek = no_llseek, + .release = seq_release, }; #ifdef CONFIG_RCU_BOOST @@ -84,22 +113,26 @@ static char convert_kthread_status(unsigned int kthread_status) static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) { + long ql, qll; + if (!rdp->beenonline) return; - seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pgp=%lu qp=%d", + seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', - rdp->completed, rdp->gpnum, - rdp->passed_quiesce, rdp->passed_quiesce_gpnum, - rdp->qs_pending); + ulong2long(rdp->completed), ulong2long(rdp->gpnum), + rdp->passed_quiesce, rdp->qs_pending); seq_printf(m, " dt=%d/%llx/%d df=%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); seq_printf(m, " of=%lu", rdp->offline_fqs); + rcu_nocb_q_lengths(rdp, &ql, &qll); + qll += rdp->qlen_lazy; + ql += rdp->qlen; seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", - rdp->qlen_lazy, rdp->qlen, + qll, ql, ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]], ".R"[rdp->nxttail[RCU_WAIT_TAIL] != @@ -108,110 +141,74 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->nxttail[RCU_WAIT_TAIL]], ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); #ifdef CONFIG_RCU_BOOST - seq_printf(m, " kt=%d/%c/%d ktl=%x", + seq_printf(m, " kt=%d/%c ktl=%x", per_cpu(rcu_cpu_has_work, rdp->cpu), convert_kthread_status(per_cpu(rcu_cpu_kthread_status, rdp->cpu)), - per_cpu(rcu_cpu_kthread_cpu, rdp->cpu), per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); #endif /* #ifdef CONFIG_RCU_BOOST */ seq_printf(m, " b=%ld", rdp->blimit); - seq_printf(m, " ci=%lu co=%lu ca=%lu\n", - rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); + seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n", + rdp->n_cbs_invoked, rdp->n_nocbs_invoked, + rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } -static int show_rcudata(struct seq_file *m, void *unused) +static int show_rcudata(struct seq_file *m, void *v) { - int cpu; - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) { - seq_printf(m, "%s:\n", rsp->name); - for_each_possible_cpu(cpu) - print_one_rcu_data(m, per_cpu_ptr(rsp->rda, cpu)); - } + print_one_rcu_data(m, (struct rcu_data *)v); return 0; } +static const struct seq_operations rcudate_op = { + .start = r_start, + .next = r_next, + .stop = r_stop, + .show = show_rcudata, +}; + static int rcudata_open(struct inode *inode, struct file *file) { - return single_open(file, show_rcudata, NULL); + return r_open(inode, file, &rcudate_op); } static const struct file_operations rcudata_fops = { .owner = THIS_MODULE, .open = rcudata_open, .read = seq_read, - .llseek = seq_lseek, - .release = single_release, + .llseek = no_llseek, + .release = seq_release, }; -static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) +static int show_rcuexp(struct seq_file *m, void *v) { - if (!rdp->beenonline) - return; - seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d", - rdp->cpu, - cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", - rdp->completed, rdp->gpnum, - rdp->passed_quiesce, rdp->passed_quiesce_gpnum, - rdp->qs_pending); - seq_printf(m, ",%d,%llx,%d,%lu", - atomic_read(&rdp->dynticks->dynticks), - rdp->dynticks->dynticks_nesting, - rdp->dynticks->dynticks_nmi_nesting, - rdp->dynticks_fqs); - seq_printf(m, ",%lu", rdp->offline_fqs); - seq_printf(m, ",%ld,%ld,\"%c%c%c%c\"", rdp->qlen_lazy, rdp->qlen, - ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != - rdp->nxttail[RCU_NEXT_TAIL]], - ".R"[rdp->nxttail[RCU_WAIT_TAIL] != - rdp->nxttail[RCU_NEXT_READY_TAIL]], - ".W"[rdp->nxttail[RCU_DONE_TAIL] != - rdp->nxttail[RCU_WAIT_TAIL]], - ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); -#ifdef CONFIG_RCU_BOOST - seq_printf(m, ",%d,\"%c\"", - per_cpu(rcu_cpu_has_work, rdp->cpu), - convert_kthread_status(per_cpu(rcu_cpu_kthread_status, - rdp->cpu))); -#endif /* #ifdef CONFIG_RCU_BOOST */ - seq_printf(m, ",%ld", rdp->blimit); - seq_printf(m, ",%lu,%lu,%lu\n", - rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); -} - -static int show_rcudata_csv(struct seq_file *m, void *unused) -{ - int cpu; - struct rcu_state *rsp; - - seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); - seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); - seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); -#ifdef CONFIG_RCU_BOOST - seq_puts(m, "\"kt\",\"ktl\""); -#endif /* #ifdef CONFIG_RCU_BOOST */ - seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); - for_each_rcu_flavor(rsp) { - seq_printf(m, "\"%s:\"\n", rsp->name); - for_each_possible_cpu(cpu) - print_one_rcu_data_csv(m, per_cpu_ptr(rsp->rda, cpu)); - } + struct rcu_state *rsp = (struct rcu_state *)m->private; + + seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n", + atomic_long_read(&rsp->expedited_start), + atomic_long_read(&rsp->expedited_done), + atomic_long_read(&rsp->expedited_wrap), + atomic_long_read(&rsp->expedited_tryfail), + atomic_long_read(&rsp->expedited_workdone1), + atomic_long_read(&rsp->expedited_workdone2), + atomic_long_read(&rsp->expedited_normal), + atomic_long_read(&rsp->expedited_stoppedcpus), + atomic_long_read(&rsp->expedited_done_tries), + atomic_long_read(&rsp->expedited_done_lost), + atomic_long_read(&rsp->expedited_done_exit)); return 0; } -static int rcudata_csv_open(struct inode *inode, struct file *file) +static int rcuexp_open(struct inode *inode, struct file *file) { - return single_open(file, show_rcudata_csv, NULL); + return single_open(file, show_rcuexp, inode->i_private); } -static const struct file_operations rcudata_csv_fops = { +static const struct file_operations rcuexp_fops = { .owner = THIS_MODULE, - .open = rcudata_csv_open, + .open = rcuexp_open, .read = seq_read, - .llseek = seq_lseek, - .release = single_release, + .llseek = no_llseek, + .release = seq_release, }; #ifdef CONFIG_RCU_BOOST @@ -257,27 +254,11 @@ static const struct file_operations rcu_node_boost_fops = { .owner = THIS_MODULE, .open = rcu_node_boost_open, .read = seq_read, - .llseek = seq_lseek, + .llseek = no_llseek, .release = single_release, }; -/* - * Create the rcuboost debugfs entry. Standard error return. - */ -static int rcu_boost_trace_create_file(struct dentry *rcudir) -{ - return !debugfs_create_file("rcuboost", 0444, rcudir, NULL, - &rcu_node_boost_fops); -} - -#else /* #ifdef CONFIG_RCU_BOOST */ - -static int rcu_boost_trace_create_file(struct dentry *rcudir) -{ - return 0; /* There cannot be an error if we didn't create it! */ -} - -#endif /* #else #ifdef CONFIG_RCU_BOOST */ +#endif /* #ifdef CONFIG_RCU_BOOST */ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) { @@ -286,8 +267,9 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) struct rcu_node *rnp; gpnum = rsp->gpnum; - seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x ", - rsp->name, rsp->completed, gpnum, rsp->fqs_state, + seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ", + ulong2long(rsp->completed), ulong2long(gpnum), + rsp->fqs_state, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff)); seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", @@ -309,26 +291,24 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) seq_puts(m, "\n"); } -static int show_rcuhier(struct seq_file *m, void *unused) +static int show_rcuhier(struct seq_file *m, void *v) { - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) - print_one_rcu_state(m, rsp); + struct rcu_state *rsp = (struct rcu_state *)m->private; + print_one_rcu_state(m, rsp); return 0; } static int rcuhier_open(struct inode *inode, struct file *file) { - return single_open(file, show_rcuhier, NULL); + return single_open(file, show_rcuhier, inode->i_private); } static const struct file_operations rcuhier_fops = { .owner = THIS_MODULE, .open = rcuhier_open, .read = seq_read, - .llseek = seq_lseek, - .release = single_release, + .llseek = no_llseek, + .release = seq_release, }; static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) @@ -341,42 +321,42 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) struct rcu_node *rnp = &rsp->node[0]; raw_spin_lock_irqsave(&rnp->lock, flags); - completed = rsp->completed; - gpnum = rsp->gpnum; - if (rsp->completed == rsp->gpnum) + completed = ACCESS_ONCE(rsp->completed); + gpnum = ACCESS_ONCE(rsp->gpnum); + if (completed == gpnum) gpage = 0; else gpage = jiffies - rsp->gp_start; gpmax = rsp->gp_max; raw_spin_unlock_irqrestore(&rnp->lock, flags); - seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n", - rsp->name, completed, gpnum, gpage, gpmax); + seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n", + ulong2long(completed), ulong2long(gpnum), gpage, gpmax); } -static int show_rcugp(struct seq_file *m, void *unused) +static int show_rcugp(struct seq_file *m, void *v) { - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) - show_one_rcugp(m, rsp); + struct rcu_state *rsp = (struct rcu_state *)m->private; + show_one_rcugp(m, rsp); return 0; } static int rcugp_open(struct inode *inode, struct file *file) { - return single_open(file, show_rcugp, NULL); + return single_open(file, show_rcugp, inode->i_private); } static const struct file_operations rcugp_fops = { .owner = THIS_MODULE, .open = rcugp_open, .read = seq_read, - .llseek = seq_lseek, - .release = single_release, + .llseek = no_llseek, + .release = seq_release, }; static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) { + if (!rdp->beenonline) + return; seq_printf(m, "%3d%cnp=%ld ", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', @@ -386,41 +366,36 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) rdp->n_rp_report_qs, rdp->n_rp_cb_ready, rdp->n_rp_cpu_needs_gp); - seq_printf(m, "gpc=%ld gps=%ld nf=%ld nn=%ld\n", + seq_printf(m, "gpc=%ld gps=%ld nn=%ld\n", rdp->n_rp_gp_completed, rdp->n_rp_gp_started, - rdp->n_rp_need_fqs, rdp->n_rp_need_nothing); } -static int show_rcu_pending(struct seq_file *m, void *unused) +static int show_rcu_pending(struct seq_file *m, void *v) { - int cpu; - struct rcu_data *rdp; - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) { - seq_printf(m, "%s:\n", rsp->name); - for_each_possible_cpu(cpu) { - rdp = per_cpu_ptr(rsp->rda, cpu); - if (rdp->beenonline) - print_one_rcu_pending(m, rdp); - } - } + print_one_rcu_pending(m, (struct rcu_data *)v); return 0; } +static const struct seq_operations rcu_pending_op = { + .start = r_start, + .next = r_next, + .stop = r_stop, + .show = show_rcu_pending, +}; + static int rcu_pending_open(struct inode *inode, struct file *file) { - return single_open(file, show_rcu_pending, NULL); + return r_open(inode, file, &rcu_pending_op); } static const struct file_operations rcu_pending_fops = { .owner = THIS_MODULE, .open = rcu_pending_open, .read = seq_read, - .llseek = seq_lseek, - .release = single_release, + .llseek = no_llseek, + .release = seq_release, }; static int show_rcutorture(struct seq_file *m, void *unused) @@ -450,43 +425,58 @@ static struct dentry *rcudir; static int __init rcutree_trace_init(void) { + struct rcu_state *rsp; struct dentry *retval; + struct dentry *rspdir; rcudir = debugfs_create_dir("rcu", NULL); if (!rcudir) goto free_out; - retval = debugfs_create_file("rcubarrier", 0444, rcudir, - NULL, &rcubarrier_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcudata", 0444, rcudir, - NULL, &rcudata_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcudata.csv", 0444, rcudir, - NULL, &rcudata_csv_fops); - if (!retval) - goto free_out; - - if (rcu_boost_trace_create_file(rcudir)) - goto free_out; + for_each_rcu_flavor(rsp) { + rspdir = debugfs_create_dir(rsp->name, rcudir); + if (!rspdir) + goto free_out; + + retval = debugfs_create_file("rcudata", 0444, + rspdir, rsp, &rcudata_fops); + if (!retval) + goto free_out; + + retval = debugfs_create_file("rcuexp", 0444, + rspdir, rsp, &rcuexp_fops); + if (!retval) + goto free_out; + + retval = debugfs_create_file("rcu_pending", 0444, + rspdir, rsp, &rcu_pending_fops); + if (!retval) + goto free_out; + + retval = debugfs_create_file("rcubarrier", 0444, + rspdir, rsp, &rcubarrier_fops); + if (!retval) + goto free_out; - retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); - if (!retval) - goto free_out; +#ifdef CONFIG_RCU_BOOST + if (rsp == &rcu_preempt_state) { + retval = debugfs_create_file("rcuboost", 0444, + rspdir, NULL, &rcu_node_boost_fops); + if (!retval) + goto free_out; + } +#endif - retval = debugfs_create_file("rcuhier", 0444, rcudir, - NULL, &rcuhier_fops); - if (!retval) - goto free_out; + retval = debugfs_create_file("rcugp", 0444, + rspdir, rsp, &rcugp_fops); + if (!retval) + goto free_out; - retval = debugfs_create_file("rcu_pending", 0444, rcudir, - NULL, &rcu_pending_fops); - if (!retval) - goto free_out; + retval = debugfs_create_file("rcuhier", 0444, + rspdir, rsp, &rcuhier_fops); + if (!retval) + goto free_out; + } retval = debugfs_create_file("rcutorture", 0444, rcudir, NULL, &rcutorture_fops); diff --git a/kernel/res_counter.c b/kernel/res_counter.c index ad581aa2369a..ff55247e7049 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -86,33 +86,39 @@ int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, return __res_counter_charge(counter, val, limit_fail_at, true); } -void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) +u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) { if (WARN_ON(counter->usage < val)) val = counter->usage; counter->usage -= val; + return counter->usage; } -void res_counter_uncharge_until(struct res_counter *counter, - struct res_counter *top, - unsigned long val) +u64 res_counter_uncharge_until(struct res_counter *counter, + struct res_counter *top, + unsigned long val) { unsigned long flags; struct res_counter *c; + u64 ret = 0; local_irq_save(flags); for (c = counter; c != top; c = c->parent) { + u64 r; spin_lock(&c->lock); - res_counter_uncharge_locked(c, val); + r = res_counter_uncharge_locked(c, val); + if (c == counter) + ret = r; spin_unlock(&c->lock); } local_irq_restore(flags); + return ret; } -void res_counter_uncharge(struct res_counter *counter, unsigned long val) +u64 res_counter_uncharge(struct res_counter *counter, unsigned long val) { - res_counter_uncharge_until(counter, NULL, val); + return res_counter_uncharge_until(counter, NULL, val); } static inline unsigned long long * @@ -192,25 +198,3 @@ int res_counter_memparse_write_strategy(const char *buf, *res = PAGE_ALIGN(*res); return 0; } - -int res_counter_write(struct res_counter *counter, int member, - const char *buf, write_strategy_fn write_strategy) -{ - char *end; - unsigned long flags; - unsigned long long tmp, *val; - - if (write_strategy) { - if (write_strategy(buf, &tmp)) - return -EINVAL; - } else { - tmp = simple_strtoull(buf, &end, 10); - if (*end != '\0') - return -EINVAL; - } - spin_lock_irqsave(&counter->lock, flags); - val = res_counter_member(counter, member); - *val = tmp; - spin_unlock_irqrestore(&counter->lock, flags); - return 0; -} diff --git a/kernel/resource.c b/kernel/resource.c index 34d45886ee84..73f35d4b30b9 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -763,6 +763,7 @@ static void __init __reserve_region_with_split(struct resource *root, struct resource *parent = root; struct resource *conflict; struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); + struct resource *next_res = NULL; if (!res) return; @@ -772,21 +773,46 @@ static void __init __reserve_region_with_split(struct resource *root, res->end = end; res->flags = IORESOURCE_BUSY; - conflict = __request_resource(parent, res); - if (!conflict) - return; + while (1) { - /* failed, split and try again */ - kfree(res); + conflict = __request_resource(parent, res); + if (!conflict) { + if (!next_res) + break; + res = next_res; + next_res = NULL; + continue; + } - /* conflict covered whole area */ - if (conflict->start <= start && conflict->end >= end) - return; + /* conflict covered whole area */ + if (conflict->start <= res->start && + conflict->end >= res->end) { + kfree(res); + WARN_ON(next_res); + break; + } + + /* failed, split and try again */ + if (conflict->start > res->start) { + end = res->end; + res->end = conflict->start - 1; + if (conflict->end < end) { + next_res = kzalloc(sizeof(*next_res), + GFP_ATOMIC); + if (!next_res) { + kfree(res); + break; + } + next_res->name = name; + next_res->start = conflict->end + 1; + next_res->end = end; + next_res->flags = IORESOURCE_BUSY; + } + } else { + res->start = conflict->end + 1; + } + } - if (conflict->start > start) - __reserve_region_with_split(root, start, conflict->start-1, name); - if (conflict->end < end) - __reserve_region_with_split(root, conflict->end+1, end, name); } void __init reserve_region_with_split(struct resource *root, diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 173ea52f3af0..f06d249e103b 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif -obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o +obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o obj-$(CONFIG_SMP) += cpupri.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 649c9f876cb1..257002c13bb0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -72,6 +72,7 @@ #include <linux/slab.h> #include <linux/init_task.h> #include <linux/binfmts.h> +#include <linux/context_tracking.h> #include <asm/switch_to.h> #include <asm/tlb.h> @@ -192,23 +193,10 @@ static void sched_feat_disable(int i) { }; static void sched_feat_enable(int i) { }; #endif /* HAVE_JUMP_LABEL */ -static ssize_t -sched_feat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +static int sched_feat_set(char *cmp) { - char buf[64]; - char *cmp; - int neg = 0; int i; - - if (cnt > 63) - cnt = 63; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - cmp = strstrip(buf); + int neg = 0; if (strncmp(cmp, "NO_", 3) == 0) { neg = 1; @@ -228,6 +216,27 @@ sched_feat_write(struct file *filp, const char __user *ubuf, } } + return i; +} + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp; + int i; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + cmp = strstrip(buf); + + i = sched_feat_set(cmp); if (i == __SCHED_FEAT_NR) return -EINVAL; @@ -505,7 +514,7 @@ static inline void init_hrtick(void) #ifdef CONFIG_SMP #ifndef tsk_is_polling -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) +#define tsk_is_polling(t) 0 #endif void resched_task(struct task_struct *p) @@ -740,126 +749,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dequeue_task(rq, p, flags); } -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - -/* - * There are no locks covering percpu hardirq/softirq time. - * They are only modified in account_system_vtime, on corresponding CPU - * with interrupts disabled. So, writes are safe. - * They are read and saved off onto struct rq in update_rq_clock(). - * This may result in other CPU reading this CPU's irq time and can - * race with irq/account_system_vtime on this CPU. We would either get old - * or new value with a side effect of accounting a slice of irq time to wrong - * task when irq is in progress while we read rq->clock. That is a worthy - * compromise in place of having locks on each irq in account_system_time. - */ -static DEFINE_PER_CPU(u64, cpu_hardirq_time); -static DEFINE_PER_CPU(u64, cpu_softirq_time); - -static DEFINE_PER_CPU(u64, irq_start_time); -static int sched_clock_irqtime; - -void enable_sched_clock_irqtime(void) -{ - sched_clock_irqtime = 1; -} - -void disable_sched_clock_irqtime(void) -{ - sched_clock_irqtime = 0; -} - -#ifndef CONFIG_64BIT -static DEFINE_PER_CPU(seqcount_t, irq_time_seq); - -static inline void irq_time_write_begin(void) -{ - __this_cpu_inc(irq_time_seq.sequence); - smp_wmb(); -} - -static inline void irq_time_write_end(void) -{ - smp_wmb(); - __this_cpu_inc(irq_time_seq.sequence); -} - -static inline u64 irq_time_read(int cpu) -{ - u64 irq_time; - unsigned seq; - - do { - seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); - irq_time = per_cpu(cpu_softirq_time, cpu) + - per_cpu(cpu_hardirq_time, cpu); - } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); - - return irq_time; -} -#else /* CONFIG_64BIT */ -static inline void irq_time_write_begin(void) -{ -} - -static inline void irq_time_write_end(void) -{ -} - -static inline u64 irq_time_read(int cpu) -{ - return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); -} -#endif /* CONFIG_64BIT */ - -/* - * Called before incrementing preempt_count on {soft,}irq_enter - * and before decrementing preempt_count on {soft,}irq_exit. - */ -void account_system_vtime(struct task_struct *curr) -{ - unsigned long flags; - s64 delta; - int cpu; - - if (!sched_clock_irqtime) - return; - - local_irq_save(flags); - - cpu = smp_processor_id(); - delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); - __this_cpu_add(irq_start_time, delta); - - irq_time_write_begin(); - /* - * We do not account for softirq time from ksoftirqd here. - * We want to continue accounting softirq time to ksoftirqd thread - * in that case, so as not to confuse scheduler with a special task - * that do not consume any time, but still wants to run. - */ - if (hardirq_count()) - __this_cpu_add(cpu_hardirq_time, delta); - else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) - __this_cpu_add(cpu_softirq_time, delta); - - irq_time_write_end(); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(account_system_vtime); - -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ - -#ifdef CONFIG_PARAVIRT -static inline u64 steal_ticks(u64 steal) -{ - if (unlikely(steal > NSEC_PER_SEC)) - return div_u64(steal, TICK_NSEC); - - return __iter_div_u64_rem(steal, TICK_NSEC, &steal); -} -#endif - static void update_rq_clock_task(struct rq *rq, s64 delta) { /* @@ -920,43 +809,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) #endif } -#ifdef CONFIG_IRQ_TIME_ACCOUNTING -static int irqtime_account_hi_update(void) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - unsigned long flags; - u64 latest_ns; - int ret = 0; - - local_irq_save(flags); - latest_ns = this_cpu_read(cpu_hardirq_time); - if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) - ret = 1; - local_irq_restore(flags); - return ret; -} - -static int irqtime_account_si_update(void) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - unsigned long flags; - u64 latest_ns; - int ret = 0; - - local_irq_save(flags); - latest_ns = this_cpu_read(cpu_softirq_time); - if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) - ret = 1; - local_irq_restore(flags); - return ret; -} - -#else /* CONFIG_IRQ_TIME_ACCOUNTING */ - -#define sched_clock_irqtime (0) - -#endif - void sched_set_stop_task(int cpu, struct task_struct *stop) { struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; @@ -1079,6 +931,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) rq->skip_clock_update = 1; } +static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); + +void register_task_migration_notifier(struct notifier_block *n) +{ + atomic_notifier_chain_register(&task_migration_notifier, n); +} + #ifdef CONFIG_SMP void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { @@ -1109,8 +968,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) trace_sched_migrate_task(p, new_cpu); if (task_cpu(p) != new_cpu) { + struct task_migration_notifier tmn; + + if (p->sched_class->migrate_task_rq) + p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); + + tmn.task = p; + tmn.from_cpu = task_cpu(p); + tmn.to_cpu = new_cpu; + + atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); } __set_task_cpu(p, new_cpu); @@ -1518,25 +1387,6 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) smp_send_reschedule(cpu); } -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW -static int ttwu_activate_remote(struct task_struct *p, int wake_flags) -{ - struct rq *rq; - int ret = 0; - - rq = __task_rq_lock(p); - if (p->on_cpu) { - ttwu_activate(rq, p, ENQUEUE_WAKEUP); - ttwu_do_wakeup(rq, p, wake_flags); - ret = 1; - } - __task_rq_unlock(rq); - - return ret; - -} -#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ - bool cpus_share_cache(int this_cpu, int that_cpu) { return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); @@ -1597,21 +1447,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * If the owning (remote) cpu is still in the middle of schedule() with * this task as prev, wait until its done referencing the task. */ - while (p->on_cpu) { -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - /* - * In case the architecture enables interrupts in - * context_switch(), we cannot busy wait, since that - * would lead to deadlocks when an interrupt hits and - * tries to wake up @prev. So bail and do a complete - * remote wakeup. - */ - if (ttwu_activate_remote(p, wake_flags)) - goto stat; -#else + while (p->on_cpu) cpu_relax(); -#endif - } /* * Pairs with the smp_wmb() in finish_lock_switch(). */ @@ -1713,6 +1550,15 @@ static void __sched_fork(struct task_struct *p) p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). + */ +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) + p->se.avg.runnable_avg_period = 0; + p->se.avg.runnable_avg_sum = 0; +#endif #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif @@ -1722,8 +1568,41 @@ static void __sched_fork(struct task_struct *p) #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); #endif + +#ifdef CONFIG_NUMA_BALANCING + if (p->mm && atomic_read(&p->mm->mm_users) == 1) { + p->mm->numa_next_scan = jiffies; + p->mm->numa_next_reset = jiffies; + p->mm->numa_scan_seq = 0; + } + + p->node_stamp = 0ULL; + p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; + p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; + p->numa_scan_period = sysctl_numa_balancing_scan_delay; + p->numa_work.next = &p->numa_work; +#endif /* CONFIG_NUMA_BALANCING */ } +#ifdef CONFIG_NUMA_BALANCING +#ifdef CONFIG_SCHED_DEBUG +void set_numabalancing_state(bool enabled) +{ + if (enabled) + sched_feat_set("NUMA"); + else + sched_feat_set("NO_NUMA"); +} +#else +__read_mostly bool numabalancing_enabled; + +void set_numabalancing_state(bool enabled) +{ + numabalancing_enabled = enabled; +} +#endif /* CONFIG_SCHED_DEBUG */ +#endif /* CONFIG_NUMA_BALANCING */ + /* * fork()/clone()-time setup: */ @@ -1953,14 +1832,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) * Manfred Spraul <manfred@colorfullife.com> */ prev_state = prev->state; + vtime_task_switch(prev); finish_arch_switch(prev); -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - local_irq_disable(); -#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ perf_event_task_sched_in(prev, current); -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - local_irq_enable(); -#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ finish_lock_switch(rq, prev); finish_arch_post_lock_switch(); @@ -2080,6 +1954,7 @@ context_switch(struct rq *rq, struct task_struct *prev, spin_release(&rq->lock.dep_map, 1, _THIS_IP_); #endif + context_tracking_task_switch(prev, next); /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); @@ -2809,404 +2684,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } -#ifdef CONFIG_CGROUP_CPUACCT -struct cgroup_subsys cpuacct_subsys; -struct cpuacct root_cpuacct; -#endif - -static inline void task_group_account_field(struct task_struct *p, int index, - u64 tmp) -{ -#ifdef CONFIG_CGROUP_CPUACCT - struct kernel_cpustat *kcpustat; - struct cpuacct *ca; -#endif - /* - * Since all updates are sure to touch the root cgroup, we - * get ourselves ahead and touch it first. If the root cgroup - * is the only cgroup, then nothing else should be necessary. - * - */ - __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; - -#ifdef CONFIG_CGROUP_CPUACCT - if (unlikely(!cpuacct_subsys.active)) - return; - - rcu_read_lock(); - ca = task_ca(p); - while (ca && (ca != &root_cpuacct)) { - kcpustat = this_cpu_ptr(ca->cpustat); - kcpustat->cpustat[index] += tmp; - ca = parent_ca(ca); - } - rcu_read_unlock(); -#endif -} - - -/* - * Account user cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in user space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - */ -void account_user_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) -{ - int index; - - /* Add user time to process. */ - p->utime += cputime; - p->utimescaled += cputime_scaled; - account_group_user_time(p, cputime); - - index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; - - /* Add user time to cpustat. */ - task_group_account_field(p, index, (__force u64) cputime); - - /* Account for user time used */ - acct_update_integrals(p); -} - -/* - * Account guest cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in virtual machine since the last update - * @cputime_scaled: cputime scaled by cpu frequency - */ -static void account_guest_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - - /* Add guest time to process. */ - p->utime += cputime; - p->utimescaled += cputime_scaled; - account_group_user_time(p, cputime); - p->gtime += cputime; - - /* Add guest time to cpustat. */ - if (TASK_NICE(p) > 0) { - cpustat[CPUTIME_NICE] += (__force u64) cputime; - cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; - } else { - cpustat[CPUTIME_USER] += (__force u64) cputime; - cpustat[CPUTIME_GUEST] += (__force u64) cputime; - } -} - -/* - * Account system cpu time to a process and desired cpustat field - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - * @target_cputime64: pointer to cpustat field that has to be updated - */ -static inline -void __account_system_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled, int index) -{ - /* Add system time to process. */ - p->stime += cputime; - p->stimescaled += cputime_scaled; - account_group_system_time(p, cputime); - - /* Add system time to cpustat. */ - task_group_account_field(p, index, (__force u64) cputime); - - /* Account for system time used */ - acct_update_integrals(p); -} - -/* - * Account system cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @hardirq_offset: the offset to subtract from hardirq_count() - * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - */ -void account_system_time(struct task_struct *p, int hardirq_offset, - cputime_t cputime, cputime_t cputime_scaled) -{ - int index; - - if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { - account_guest_time(p, cputime, cputime_scaled); - return; - } - - if (hardirq_count() - hardirq_offset) - index = CPUTIME_IRQ; - else if (in_serving_softirq()) - index = CPUTIME_SOFTIRQ; - else - index = CPUTIME_SYSTEM; - - __account_system_time(p, cputime, cputime_scaled, index); -} - -/* - * Account for involuntary wait time. - * @cputime: the cpu time spent in involuntary wait - */ -void account_steal_time(cputime_t cputime) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - - cpustat[CPUTIME_STEAL] += (__force u64) cputime; -} - -/* - * Account for idle time. - * @cputime: the cpu time spent in idle wait - */ -void account_idle_time(cputime_t cputime) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - struct rq *rq = this_rq(); - - if (atomic_read(&rq->nr_iowait) > 0) - cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; - else - cpustat[CPUTIME_IDLE] += (__force u64) cputime; -} - -static __always_inline bool steal_account_process_tick(void) -{ -#ifdef CONFIG_PARAVIRT - if (static_key_false(¶virt_steal_enabled)) { - u64 steal, st = 0; - - steal = paravirt_steal_clock(smp_processor_id()); - steal -= this_rq()->prev_steal_time; - - st = steal_ticks(steal); - this_rq()->prev_steal_time += st * TICK_NSEC; - - account_steal_time(st); - return st; - } -#endif - return false; -} - -#ifndef CONFIG_VIRT_CPU_ACCOUNTING - -#ifdef CONFIG_IRQ_TIME_ACCOUNTING -/* - * Account a tick to a process and cpustat - * @p: the process that the cpu time gets accounted to - * @user_tick: is the tick from userspace - * @rq: the pointer to rq - * - * Tick demultiplexing follows the order - * - pending hardirq update - * - pending softirq update - * - user_time - * - idle_time - * - system time - * - check for guest_time - * - else account as system_time - * - * Check for hardirq is done both for system and user time as there is - * no timer going off while we are on hardirq and hence we may never get an - * opportunity to update it solely in system time. - * p->stime and friends are only updated on system time and not on irq - * softirq as those do not count in task exec_runtime any more. - */ -static void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq) -{ - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); - u64 *cpustat = kcpustat_this_cpu->cpustat; - - if (steal_account_process_tick()) - return; - - if (irqtime_account_hi_update()) { - cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; - } else if (irqtime_account_si_update()) { - cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; - } else if (this_cpu_ksoftirqd() == p) { - /* - * ksoftirqd time do not get accounted in cpu_softirq_time. - * So, we have to handle it separately here. - * Also, p->stime needs to be updated for ksoftirqd. - */ - __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, - CPUTIME_SOFTIRQ); - } else if (user_tick) { - account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); - } else if (p == rq->idle) { - account_idle_time(cputime_one_jiffy); - } else if (p->flags & PF_VCPU) { /* System time or guest time */ - account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); - } else { - __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, - CPUTIME_SYSTEM); - } -} - -static void irqtime_account_idle_ticks(int ticks) -{ - int i; - struct rq *rq = this_rq(); - - for (i = 0; i < ticks; i++) - irqtime_account_process_tick(current, 0, rq); -} -#else /* CONFIG_IRQ_TIME_ACCOUNTING */ -static void irqtime_account_idle_ticks(int ticks) {} -static void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq) {} -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ - -/* - * Account a single tick of cpu time. - * @p: the process that the cpu time gets accounted to - * @user_tick: indicates if the tick is a user or a system tick - */ -void account_process_tick(struct task_struct *p, int user_tick) -{ - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); - struct rq *rq = this_rq(); - - if (sched_clock_irqtime) { - irqtime_account_process_tick(p, user_tick, rq); - return; - } - - if (steal_account_process_tick()) - return; - - if (user_tick) - account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); - else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, - one_jiffy_scaled); - else - account_idle_time(cputime_one_jiffy); -} - -/* - * Account multiple ticks of steal time. - * @p: the process from which the cpu time has been stolen - * @ticks: number of stolen ticks - */ -void account_steal_ticks(unsigned long ticks) -{ - account_steal_time(jiffies_to_cputime(ticks)); -} - -/* - * Account multiple ticks of idle time. - * @ticks: number of stolen ticks - */ -void account_idle_ticks(unsigned long ticks) -{ - - if (sched_clock_irqtime) { - irqtime_account_idle_ticks(ticks); - return; - } - - account_idle_time(jiffies_to_cputime(ticks)); -} - -#endif - -/* - * Use precise platform statistics if available: - */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING -void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - *ut = p->utime; - *st = p->stime; -} - -void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - struct task_cputime cputime; - - thread_group_cputime(p, &cputime); - - *ut = cputime.utime; - *st = cputime.stime; -} -#else - -#ifndef nsecs_to_cputime -# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) -#endif - -static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) -{ - u64 temp = (__force u64) rtime; - - temp *= (__force u64) utime; - - if (sizeof(cputime_t) == 4) - temp = div_u64(temp, (__force u32) total); - else - temp = div64_u64(temp, (__force u64) total); - - return (__force cputime_t) temp; -} - -void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - cputime_t rtime, utime = p->utime, total = utime + p->stime; - - /* - * Use CFS's precise accounting: - */ - rtime = nsecs_to_cputime(p->se.sum_exec_runtime); - - if (total) - utime = scale_utime(utime, rtime, total); - else - utime = rtime; - - /* - * Compare with previous values, to keep monotonicity: - */ - p->prev_utime = max(p->prev_utime, utime); - p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); - - *ut = p->prev_utime; - *st = p->prev_stime; -} - -/* - * Must be called with siglock held. - */ -void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - struct signal_struct *sig = p->signal; - struct task_cputime cputime; - cputime_t rtime, utime, total; - - thread_group_cputime(p, &cputime); - - total = cputime.utime + cputime.stime; - rtime = nsecs_to_cputime(cputime.sum_exec_runtime); - - if (total) - utime = scale_utime(cputime.utime, rtime, total); - else - utime = rtime; - - sig->prev_utime = max(sig->prev_utime, utime); - sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); - - *ut = sig->prev_utime; - *st = sig->prev_stime; -} -#endif - /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. @@ -3367,6 +2844,40 @@ pick_next_task(struct rq *rq) /* * __schedule() is the main scheduler function. + * + * The main means of driving the scheduler and thus entering this function are: + * + * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. + * + * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return + * paths. For example, see arch/x86/entry_64.S. + * + * To drive preemption between tasks, the scheduler sets the flag in timer + * interrupt handler scheduler_tick(). + * + * 3. Wakeups don't really cause entry into schedule(). They add a + * task to the run-queue and that's it. + * + * Now, if the new task added to the run-queue preempts the current + * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets + * called on the nearest possible occasion: + * + * - If the kernel is preemptible (CONFIG_PREEMPT=y): + * + * - in syscall or exception context, at the next outmost + * preempt_enable(). (this might be as soon as the wake_up()'s + * spin_unlock()!) + * + * - in IRQ context, return from interrupt-handler to + * preemptible context + * + * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) + * then at the next: + * + * - cond_resched() call + * - explicit schedule() call + * - return from syscall or exception to user-space + * - return from interrupt-handler to user-space */ static void __sched __schedule(void) { @@ -3468,6 +2979,21 @@ asmlinkage void __sched schedule(void) } EXPORT_SYMBOL(schedule); +#ifdef CONFIG_CONTEXT_TRACKING +asmlinkage void __sched schedule_user(void) +{ + /* + * If we come here after a random call to set_need_resched(), + * or we have been woken up remotely but the IPI has not yet arrived, + * we haven't yet exited the RCU idle mode. Do it here manually until + * we find a better solution. + */ + user_exit(); + schedule(); + user_enter(); +} +#endif + /** * schedule_preempt_disabled - called with preemption disabled * @@ -3569,6 +3095,7 @@ asmlinkage void __sched preempt_schedule_irq(void) /* Catch callers which need to be fixed */ BUG_ON(ti->preempt_count || !irqs_disabled()); + user_exit(); do { add_preempt_count(PREEMPT_ACTIVE); local_irq_enable(); @@ -4570,8 +4097,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) goto out_free_cpus_allowed; } retval = -EPERM; - if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) - goto out_unlock; + if (!check_same_owner(p)) { + rcu_read_lock(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { + rcu_read_unlock(); + goto out_unlock; + } + rcu_read_unlock(); + } retval = security_task_setscheduler(p); if (retval) @@ -4868,13 +4401,6 @@ again: */ if (preempt && rq != p_rq) resched_task(p_rq->curr); - } else { - /* - * We might have set it in task_yield_fair(), but are - * not going to schedule(), so don't want to skip - * the next update. - */ - rq->skip_clock_update = 0; } out: @@ -5022,6 +4548,7 @@ static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; void sched_show_task(struct task_struct *p) { unsigned long free = 0; + int ppid; unsigned state; state = p->state ? __ffs(p->state) + 1 : 0; @@ -5041,8 +4568,11 @@ void sched_show_task(struct task_struct *p) #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif + rcu_read_lock(); + ppid = task_pid_nr(rcu_dereference(p->real_parent)); + rcu_read_unlock(); printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, - task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)), + task_pid_nr(p), ppid, (unsigned long)task_thread_info(p)->flags); show_stack(p, NULL); @@ -5416,16 +4946,25 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) *tablep = NULL; } +static int min_load_idx = 0; +static int max_load_idx = CPU_LOAD_IDX_MAX; + static void set_table_entry(struct ctl_table *entry, const char *procname, void *data, int maxlen, - umode_t mode, proc_handler *proc_handler) + umode_t mode, proc_handler *proc_handler, + bool load_idx) { entry->procname = procname; entry->data = data; entry->maxlen = maxlen; entry->mode = mode; entry->proc_handler = proc_handler; + + if (load_idx) { + entry->extra1 = &min_load_idx; + entry->extra2 = &max_load_idx; + } } static struct ctl_table * @@ -5437,30 +4976,30 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) return NULL; set_table_entry(&table[0], "min_interval", &sd->min_interval, - sizeof(long), 0644, proc_doulongvec_minmax); + sizeof(long), 0644, proc_doulongvec_minmax, false); set_table_entry(&table[1], "max_interval", &sd->max_interval, - sizeof(long), 0644, proc_doulongvec_minmax); + sizeof(long), 0644, proc_doulongvec_minmax, false); set_table_entry(&table[2], "busy_idx", &sd->busy_idx, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, true); set_table_entry(&table[3], "idle_idx", &sd->idle_idx, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, true); set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, true); set_table_entry(&table[5], "wake_idx", &sd->wake_idx, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, true); set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, true); set_table_entry(&table[7], "busy_factor", &sd->busy_factor, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, false); set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, false); set_table_entry(&table[9], "cache_nice_tries", &sd->cache_nice_tries, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, false); set_table_entry(&table[10], "flags", &sd->flags, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, false); set_table_entry(&table[11], "name", sd->name, - CORENAME_MAX_SIZE, 0444, proc_dostring); + CORENAME_MAX_SIZE, 0444, proc_dostring, false); /* &table[12] is terminator */ return table; @@ -5604,7 +5143,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) migrate_tasks(cpu); BUG_ON(rq->nr_running != 1); /* the migration thread */ raw_spin_unlock_irqrestore(&rq->lock, flags); + break; + case CPU_DEAD: calc_load_migrate(rq); break; #endif @@ -6537,7 +6078,6 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu) | 0*SD_BALANCE_FORK | 0*SD_BALANCE_WAKE | 0*SD_WAKE_AFFINE - | 0*SD_PREFER_LOCAL | 0*SD_SHARE_CPUPOWER | 0*SD_SHARE_PKG_RESOURCES | 1*SD_SERIALIZE @@ -6660,6 +6200,17 @@ static void sched_init_numa(void) * numbers. */ + /* + * Here, we should temporarily reset sched_domains_numa_levels to 0. + * If it fails to allocate memory for array sched_domains_numa_masks[][], + * the array will contain less then 'level' members. This could be + * dangerous when we use it to iterate array sched_domains_numa_masks[][] + * in other functions. + * + * We reset it to 'level' at the end of this function. + */ + sched_domains_numa_levels = 0; + sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); if (!sched_domains_numa_masks) return; @@ -6714,11 +6265,68 @@ static void sched_init_numa(void) } sched_domain_topology = tl; + + sched_domains_numa_levels = level; +} + +static void sched_domains_numa_masks_set(int cpu) +{ + int i, j; + int node = cpu_to_node(cpu); + + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) { + if (node_distance(j, node) <= sched_domains_numa_distance[i]) + cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); + } + } +} + +static void sched_domains_numa_masks_clear(int cpu) +{ + int i, j; + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) + cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); + } +} + +/* + * Update sched_domains_numa_masks[level][node] array when new cpus + * are onlined. + */ +static int sched_domains_numa_masks_update(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + sched_domains_numa_masks_set(cpu); + break; + + case CPU_DEAD: + sched_domains_numa_masks_clear(cpu); + break; + + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; } #else static inline void sched_init_numa(void) { } + +static int sched_domains_numa_masks_update(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + return 0; +} #endif /* CONFIG_NUMA */ static int __sdt_alloc(const struct cpumask *cpu_map) @@ -7167,6 +6775,7 @@ void __init sched_init_smp(void) mutex_unlock(&sched_domains_mutex); put_online_cpus(); + hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); @@ -7937,7 +7546,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) struct task_group, css); } -static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) +static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) { struct task_group *tg, *parent; @@ -7954,7 +7563,7 @@ static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) return &tg->css; } -static void cpu_cgroup_destroy(struct cgroup *cgrp) +static void cpu_cgroup_css_free(struct cgroup *cgrp) { struct task_group *tg = cgroup_tg(cgrp); @@ -8314,8 +7923,8 @@ static struct cftype cpu_files[] = { struct cgroup_subsys cpu_cgroup_subsys = { .name = "cpu", - .create = cpu_cgroup_create, - .destroy = cpu_cgroup_destroy, + .css_alloc = cpu_cgroup_css_alloc, + .css_free = cpu_cgroup_css_free, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, @@ -8335,8 +7944,10 @@ struct cgroup_subsys cpu_cgroup_subsys = { * (balbir@in.ibm.com). */ +struct cpuacct root_cpuacct; + /* create a new cpu accounting group */ -static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) +static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) { struct cpuacct *ca; @@ -8366,7 +7977,7 @@ out: } /* destroy an existing cpu accounting group */ -static void cpuacct_destroy(struct cgroup *cgrp) +static void cpuacct_css_free(struct cgroup *cgrp) { struct cpuacct *ca = cgroup_ca(cgrp); @@ -8537,9 +8148,15 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) struct cgroup_subsys cpuacct_subsys = { .name = "cpuacct", - .create = cpuacct_create, - .destroy = cpuacct_destroy, + .css_alloc = cpuacct_css_alloc, + .css_free = cpuacct_css_free, .subsys_id = cpuacct_subsys_id, .base_cftypes = files, }; #endif /* CONFIG_CGROUP_CPUACCT */ + +void dump_cpu_task(int cpu) +{ + pr_info("Task dump for CPU %d:\n", cpu); + sched_show_task(cpu_curr(cpu)); +} diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c new file mode 100644 index 000000000000..293b202fcf79 --- /dev/null +++ b/kernel/sched/cputime.c @@ -0,0 +1,589 @@ +#include <linux/export.h> +#include <linux/sched.h> +#include <linux/tsacct_kern.h> +#include <linux/kernel_stat.h> +#include <linux/static_key.h> +#include "sched.h" + + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +/* + * There are no locks covering percpu hardirq/softirq time. + * They are only modified in vtime_account, on corresponding CPU + * with interrupts disabled. So, writes are safe. + * They are read and saved off onto struct rq in update_rq_clock(). + * This may result in other CPU reading this CPU's irq time and can + * race with irq/vtime_account on this CPU. We would either get old + * or new value with a side effect of accounting a slice of irq time to wrong + * task when irq is in progress while we read rq->clock. That is a worthy + * compromise in place of having locks on each irq in account_system_time. + */ +DEFINE_PER_CPU(u64, cpu_hardirq_time); +DEFINE_PER_CPU(u64, cpu_softirq_time); + +static DEFINE_PER_CPU(u64, irq_start_time); +static int sched_clock_irqtime; + +void enable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 1; +} + +void disable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 0; +} + +#ifndef CONFIG_64BIT +DEFINE_PER_CPU(seqcount_t, irq_time_seq); +#endif /* CONFIG_64BIT */ + +/* + * Called before incrementing preempt_count on {soft,}irq_enter + * and before decrementing preempt_count on {soft,}irq_exit. + */ +void irqtime_account_irq(struct task_struct *curr) +{ + unsigned long flags; + s64 delta; + int cpu; + + if (!sched_clock_irqtime) + return; + + local_irq_save(flags); + + cpu = smp_processor_id(); + delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); + __this_cpu_add(irq_start_time, delta); + + irq_time_write_begin(); + /* + * We do not account for softirq time from ksoftirqd here. + * We want to continue accounting softirq time to ksoftirqd thread + * in that case, so as not to confuse scheduler with a special task + * that do not consume any time, but still wants to run. + */ + if (hardirq_count()) + __this_cpu_add(cpu_hardirq_time, delta); + else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) + __this_cpu_add(cpu_softirq_time, delta); + + irq_time_write_end(); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(irqtime_account_irq); + +static int irqtime_account_hi_update(void) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + unsigned long flags; + u64 latest_ns; + int ret = 0; + + local_irq_save(flags); + latest_ns = this_cpu_read(cpu_hardirq_time); + if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) + ret = 1; + local_irq_restore(flags); + return ret; +} + +static int irqtime_account_si_update(void) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + unsigned long flags; + u64 latest_ns; + int ret = 0; + + local_irq_save(flags); + latest_ns = this_cpu_read(cpu_softirq_time); + if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) + ret = 1; + local_irq_restore(flags); + return ret; +} + +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#define sched_clock_irqtime (0) + +#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ + +static inline void task_group_account_field(struct task_struct *p, int index, + u64 tmp) +{ +#ifdef CONFIG_CGROUP_CPUACCT + struct kernel_cpustat *kcpustat; + struct cpuacct *ca; +#endif + /* + * Since all updates are sure to touch the root cgroup, we + * get ourselves ahead and touch it first. If the root cgroup + * is the only cgroup, then nothing else should be necessary. + * + */ + __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; + +#ifdef CONFIG_CGROUP_CPUACCT + if (unlikely(!cpuacct_subsys.active)) + return; + + rcu_read_lock(); + ca = task_ca(p); + while (ca && (ca != &root_cpuacct)) { + kcpustat = this_cpu_ptr(ca->cpustat); + kcpustat->cpustat[index] += tmp; + ca = parent_ca(ca); + } + rcu_read_unlock(); +#endif +} + +/* + * Account user cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in user space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +void account_user_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) +{ + int index; + + /* Add user time to process. */ + p->utime += cputime; + p->utimescaled += cputime_scaled; + account_group_user_time(p, cputime); + + index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; + + /* Add user time to cpustat. */ + task_group_account_field(p, index, (__force u64) cputime); + + /* Account for user time used */ + acct_update_integrals(p); +} + +/* + * Account guest cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in virtual machine since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +static void account_guest_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + + /* Add guest time to process. */ + p->utime += cputime; + p->utimescaled += cputime_scaled; + account_group_user_time(p, cputime); + p->gtime += cputime; + + /* Add guest time to cpustat. */ + if (TASK_NICE(p) > 0) { + cpustat[CPUTIME_NICE] += (__force u64) cputime; + cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; + } else { + cpustat[CPUTIME_USER] += (__force u64) cputime; + cpustat[CPUTIME_GUEST] += (__force u64) cputime; + } +} + +/* + * Account system cpu time to a process and desired cpustat field + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + * @target_cputime64: pointer to cpustat field that has to be updated + */ +static inline +void __account_system_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled, int index) +{ + /* Add system time to process. */ + p->stime += cputime; + p->stimescaled += cputime_scaled; + account_group_system_time(p, cputime); + + /* Add system time to cpustat. */ + task_group_account_field(p, index, (__force u64) cputime); + + /* Account for system time used */ + acct_update_integrals(p); +} + +/* + * Account system cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +void account_system_time(struct task_struct *p, int hardirq_offset, + cputime_t cputime, cputime_t cputime_scaled) +{ + int index; + + if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { + account_guest_time(p, cputime, cputime_scaled); + return; + } + + if (hardirq_count() - hardirq_offset) + index = CPUTIME_IRQ; + else if (in_serving_softirq()) + index = CPUTIME_SOFTIRQ; + else + index = CPUTIME_SYSTEM; + + __account_system_time(p, cputime, cputime_scaled, index); +} + +/* + * Account for involuntary wait time. + * @cputime: the cpu time spent in involuntary wait + */ +void account_steal_time(cputime_t cputime) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + + cpustat[CPUTIME_STEAL] += (__force u64) cputime; +} + +/* + * Account for idle time. + * @cputime: the cpu time spent in idle wait + */ +void account_idle_time(cputime_t cputime) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + struct rq *rq = this_rq(); + + if (atomic_read(&rq->nr_iowait) > 0) + cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; + else + cpustat[CPUTIME_IDLE] += (__force u64) cputime; +} + +static __always_inline bool steal_account_process_tick(void) +{ +#ifdef CONFIG_PARAVIRT + if (static_key_false(¶virt_steal_enabled)) { + u64 steal, st = 0; + + steal = paravirt_steal_clock(smp_processor_id()); + steal -= this_rq()->prev_steal_time; + + st = steal_ticks(steal); + this_rq()->prev_steal_time += st * TICK_NSEC; + + account_steal_time(st); + return st; + } +#endif + return false; +} + +/* + * Accumulate raw cputime values of dead tasks (sig->[us]time) and live + * tasks (sum on group iteration) belonging to @tsk's group. + */ +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) +{ + struct signal_struct *sig = tsk->signal; + struct task_struct *t; + + times->utime = sig->utime; + times->stime = sig->stime; + times->sum_exec_runtime = sig->sum_sched_runtime; + + rcu_read_lock(); + /* make sure we can trust tsk->thread_group list */ + if (!likely(pid_alive(tsk))) + goto out; + + t = tsk; + do { + times->utime += t->utime; + times->stime += t->stime; + times->sum_exec_runtime += task_sched_runtime(t); + } while_each_thread(tsk, t); +out: + rcu_read_unlock(); +} + +#ifndef CONFIG_VIRT_CPU_ACCOUNTING + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +/* + * Account a tick to a process and cpustat + * @p: the process that the cpu time gets accounted to + * @user_tick: is the tick from userspace + * @rq: the pointer to rq + * + * Tick demultiplexing follows the order + * - pending hardirq update + * - pending softirq update + * - user_time + * - idle_time + * - system time + * - check for guest_time + * - else account as system_time + * + * Check for hardirq is done both for system and user time as there is + * no timer going off while we are on hardirq and hence we may never get an + * opportunity to update it solely in system time. + * p->stime and friends are only updated on system time and not on irq + * softirq as those do not count in task exec_runtime any more. + */ +static void irqtime_account_process_tick(struct task_struct *p, int user_tick, + struct rq *rq) +{ + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + u64 *cpustat = kcpustat_this_cpu->cpustat; + + if (steal_account_process_tick()) + return; + + if (irqtime_account_hi_update()) { + cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; + } else if (irqtime_account_si_update()) { + cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; + } else if (this_cpu_ksoftirqd() == p) { + /* + * ksoftirqd time do not get accounted in cpu_softirq_time. + * So, we have to handle it separately here. + * Also, p->stime needs to be updated for ksoftirqd. + */ + __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, + CPUTIME_SOFTIRQ); + } else if (user_tick) { + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + } else if (p == rq->idle) { + account_idle_time(cputime_one_jiffy); + } else if (p->flags & PF_VCPU) { /* System time or guest time */ + account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); + } else { + __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, + CPUTIME_SYSTEM); + } +} + +static void irqtime_account_idle_ticks(int ticks) +{ + int i; + struct rq *rq = this_rq(); + + for (i = 0; i < ticks; i++) + irqtime_account_process_tick(current, 0, rq); +} +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ +static void irqtime_account_idle_ticks(int ticks) {} +static void irqtime_account_process_tick(struct task_struct *p, int user_tick, + struct rq *rq) {} +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +/* + * Account a single tick of cpu time. + * @p: the process that the cpu time gets accounted to + * @user_tick: indicates if the tick is a user or a system tick + */ +void account_process_tick(struct task_struct *p, int user_tick) +{ + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + struct rq *rq = this_rq(); + + if (sched_clock_irqtime) { + irqtime_account_process_tick(p, user_tick, rq); + return; + } + + if (steal_account_process_tick()) + return; + + if (user_tick) + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) + account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, + one_jiffy_scaled); + else + account_idle_time(cputime_one_jiffy); +} + +/* + * Account multiple ticks of steal time. + * @p: the process from which the cpu time has been stolen + * @ticks: number of stolen ticks + */ +void account_steal_ticks(unsigned long ticks) +{ + account_steal_time(jiffies_to_cputime(ticks)); +} + +/* + * Account multiple ticks of idle time. + * @ticks: number of stolen ticks + */ +void account_idle_ticks(unsigned long ticks) +{ + + if (sched_clock_irqtime) { + irqtime_account_idle_ticks(ticks); + return; + } + + account_idle_time(jiffies_to_cputime(ticks)); +} + +#endif + +/* + * Use precise platform statistics if available: + */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + *ut = p->utime; + *st = p->stime; +} + +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + + *ut = cputime.utime; + *st = cputime.stime; +} + +void vtime_account_system_irqsafe(struct task_struct *tsk) +{ + unsigned long flags; + + local_irq_save(flags); + vtime_account_system(tsk); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe); + +#ifndef __ARCH_HAS_VTIME_TASK_SWITCH +void vtime_task_switch(struct task_struct *prev) +{ + if (is_idle_task(prev)) + vtime_account_idle(prev); + else + vtime_account_system(prev); + + vtime_account_user(prev); + arch_vtime_task_switch(prev); +} +#endif + +/* + * Archs that account the whole time spent in the idle task + * (outside irq) as idle time can rely on this and just implement + * vtime_account_system() and vtime_account_idle(). Archs that + * have other meaning of the idle time (s390 only includes the + * time spent by the CPU when it's in low power mode) must override + * vtime_account(). + */ +#ifndef __ARCH_HAS_VTIME_ACCOUNT +void vtime_account(struct task_struct *tsk) +{ + if (in_interrupt() || !is_idle_task(tsk)) + vtime_account_system(tsk); + else + vtime_account_idle(tsk); +} +EXPORT_SYMBOL_GPL(vtime_account); +#endif /* __ARCH_HAS_VTIME_ACCOUNT */ + +#else + +#ifndef nsecs_to_cputime +# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) +#endif + +static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) +{ + u64 temp = (__force u64) rtime; + + temp *= (__force u64) utime; + + if (sizeof(cputime_t) == 4) + temp = div_u64(temp, (__force u32) total); + else + temp = div64_u64(temp, (__force u64) total); + + return (__force cputime_t) temp; +} + +/* + * Adjust tick based cputime random precision against scheduler + * runtime accounting. + */ +static void cputime_adjust(struct task_cputime *curr, + struct cputime *prev, + cputime_t *ut, cputime_t *st) +{ + cputime_t rtime, utime, total; + + utime = curr->utime; + total = utime + curr->stime; + + /* + * Tick based cputime accounting depend on random scheduling + * timeslices of a task to be interrupted or not by the timer. + * Depending on these circumstances, the number of these interrupts + * may be over or under-optimistic, matching the real user and system + * cputime with a variable precision. + * + * Fix this by scaling these tick based values against the total + * runtime accounted by the CFS scheduler. + */ + rtime = nsecs_to_cputime(curr->sum_exec_runtime); + + if (total) + utime = scale_utime(utime, rtime, total); + else + utime = rtime; + + /* + * If the tick based count grows faster than the scheduler one, + * the result of the scaling may go backward. + * Let's enforce monotonicity. + */ + prev->utime = max(prev->utime, utime); + prev->stime = max(prev->stime, rtime - prev->utime); + + *ut = prev->utime; + *st = prev->stime; +} + +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime = { + .utime = p->utime, + .stime = p->stime, + .sum_exec_runtime = p->se.sum_exec_runtime, + }; + + cputime_adjust(&cputime, &p->prev_cputime, ut, st); +} + +/* + * Must be called with siglock held. + */ +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); +} +#endif diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 6f79596e0ea9..2cd3c1b4e582 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -61,14 +61,20 @@ static unsigned long nsec_low(unsigned long long nsec) static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { struct sched_entity *se = tg->se[cpu]; - if (!se) - return; #define P(F) \ SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) #define PN(F) \ SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) + if (!se) { + struct sched_avg *avg = &cpu_rq(cpu)->avg; + P(avg->runnable_avg_sum); + P(avg->runnable_avg_period); + return; + } + + PN(se->exec_start); PN(se->vruntime); PN(se->sum_exec_runtime); @@ -85,6 +91,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P(se->statistics.wait_count); #endif P(se->load.weight); +#ifdef CONFIG_SMP + P(se->avg.runnable_avg_sum); + P(se->avg.runnable_avg_period); + P(se->avg.load_avg_contrib); + P(se->avg.decay_count); +#endif #undef PN #undef P } @@ -206,14 +218,18 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", - SPLIT_NS(cfs_rq->load_avg)); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", - SPLIT_NS(cfs_rq->load_period)); - SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", - cfs_rq->load_contribution); - SEQ_printf(m, " .%-30s: %d\n", "load_tg", - atomic_read(&cfs_rq->tg->load_weight)); + SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg", + cfs_rq->runnable_load_avg); + SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", + cfs_rq->blocked_load_avg); + SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", + atomic64_read(&cfs_rq->tg->load_avg)); + SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib", + cfs_rq->tg_load_contrib); + SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", + cfs_rq->tg_runnable_contrib); + SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", + atomic_read(&cfs_rq->tg->runnable_avg)); #endif print_cfs_group_stats(m, cpu, cfs_rq->tg); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 96e2b18b6283..5eea8707234a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -26,6 +26,9 @@ #include <linux/slab.h> #include <linux/profile.h> #include <linux/interrupt.h> +#include <linux/mempolicy.h> +#include <linux/migrate.h> +#include <linux/task_work.h> #include <trace/events/sched.h> @@ -259,6 +262,9 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return grp->my_q; } +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, + int force_update); + static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { @@ -278,6 +284,8 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) } cfs_rq->on_list = 1; + /* We should have no load, but we need to update last_decay. */ + update_cfs_rq_blocked_load(cfs_rq, 0); } } @@ -597,7 +605,7 @@ calc_delta_fair(unsigned long delta, struct sched_entity *se) /* * The idea is to set a period in which each task runs once. * - * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch + * When there are too many tasks (sched_nr_latency) we have to stretch * this period because otherwise the slices get too small. * * p = (nr <= nl) ? l : l*nr/nl @@ -653,9 +661,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) return calc_delta_fair(sched_slice(cfs_rq, se), se); } -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); -static void update_cfs_shares(struct cfs_rq *cfs_rq); - /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -675,10 +680,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, curr->vruntime += delta_exec_weighted; update_min_vruntime(cfs_rq); - -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED - cfs_rq->load_unacc_exec_time += delta_exec; -#endif } static void update_curr(struct cfs_rq *cfs_rq) @@ -776,6 +777,230 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ +#ifdef CONFIG_NUMA_BALANCING +/* + * numa task sample period in ms + */ +unsigned int sysctl_numa_balancing_scan_period_min = 100; +unsigned int sysctl_numa_balancing_scan_period_max = 100*50; +unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; + +/* Portion of address space to scan in MB */ +unsigned int sysctl_numa_balancing_scan_size = 256; + +/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ +unsigned int sysctl_numa_balancing_scan_delay = 1000; + +static void task_numa_placement(struct task_struct *p) +{ + int seq; + + if (!p->mm) /* for example, ksmd faulting in a user's mm */ + return; + seq = ACCESS_ONCE(p->mm->numa_scan_seq); + if (p->numa_scan_seq == seq) + return; + p->numa_scan_seq = seq; + + /* FIXME: Scheduling placement policy hints go here */ +} + +/* + * Got a PROT_NONE fault for a page on @node. + */ +void task_numa_fault(int node, int pages, bool migrated) +{ + struct task_struct *p = current; + + if (!sched_feat_numa(NUMA)) + return; + + /* FIXME: Allocate task-specific structure for placement policy here */ + + /* + * If pages are properly placed (did not migrate) then scan slower. + * This is reset periodically in case of phase changes + */ + if (!migrated) + p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, + p->numa_scan_period + jiffies_to_msecs(10)); + + task_numa_placement(p); +} + +static void reset_ptenuma_scan(struct task_struct *p) +{ + ACCESS_ONCE(p->mm->numa_scan_seq)++; + p->mm->numa_scan_offset = 0; +} + +/* + * The expensive part of numa migration is done from task_work context. + * Triggered from task_tick_numa(). + */ +void task_numa_work(struct callback_head *work) +{ + unsigned long migrate, next_scan, now = jiffies; + struct task_struct *p = current; + struct mm_struct *mm = p->mm; + struct vm_area_struct *vma; + unsigned long start, end; + long pages; + + WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); + + work->next = work; /* protect against double add */ + /* + * Who cares about NUMA placement when they're dying. + * + * NOTE: make sure not to dereference p->mm before this check, + * exit_task_work() happens _after_ exit_mm() so we could be called + * without p->mm even though we still had it when we enqueued this + * work. + */ + if (p->flags & PF_EXITING) + return; + + /* + * We do not care about task placement until a task runs on a node + * other than the first one used by the address space. This is + * largely because migrations are driven by what CPU the task + * is running on. If it's never scheduled on another node, it'll + * not migrate so why bother trapping the fault. + */ + if (mm->first_nid == NUMA_PTE_SCAN_INIT) + mm->first_nid = numa_node_id(); + if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) { + /* Are we running on a new node yet? */ + if (numa_node_id() == mm->first_nid && + !sched_feat_numa(NUMA_FORCE)) + return; + + mm->first_nid = NUMA_PTE_SCAN_ACTIVE; + } + + /* + * Reset the scan period if enough time has gone by. Objective is that + * scanning will be reduced if pages are properly placed. As tasks + * can enter different phases this needs to be re-examined. Lacking + * proper tracking of reference behaviour, this blunt hammer is used. + */ + migrate = mm->numa_next_reset; + if (time_after(now, migrate)) { + p->numa_scan_period = sysctl_numa_balancing_scan_period_min; + next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); + xchg(&mm->numa_next_reset, next_scan); + } + + /* + * Enforce maximal scan/migration frequency.. + */ + migrate = mm->numa_next_scan; + if (time_before(now, migrate)) + return; + + if (p->numa_scan_period == 0) + p->numa_scan_period = sysctl_numa_balancing_scan_period_min; + + next_scan = now + msecs_to_jiffies(p->numa_scan_period); + if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) + return; + + /* + * Do not set pte_numa if the current running node is rate-limited. + * This loses statistics on the fault but if we are unwilling to + * migrate to this node, it is less likely we can do useful work + */ + if (migrate_ratelimited(numa_node_id())) + return; + + start = mm->numa_scan_offset; + pages = sysctl_numa_balancing_scan_size; + pages <<= 20 - PAGE_SHIFT; /* MB in pages */ + if (!pages) + return; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, start); + if (!vma) { + reset_ptenuma_scan(p); + start = 0; + vma = mm->mmap; + } + for (; vma; vma = vma->vm_next) { + if (!vma_migratable(vma)) + continue; + + /* Skip small VMAs. They are not likely to be of relevance */ + if (vma->vm_end - vma->vm_start < HPAGE_SIZE) + continue; + + do { + start = max(start, vma->vm_start); + end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); + end = min(end, vma->vm_end); + pages -= change_prot_numa(vma, start, end); + + start = end; + if (pages <= 0) + goto out; + } while (end != vma->vm_end); + } + +out: + /* + * It is possible to reach the end of the VMA list but the last few VMAs are + * not guaranteed to the vma_migratable. If they are not, we would find the + * !migratable VMA on the next scan but not reset the scanner to the start + * so check it now. + */ + if (vma) + mm->numa_scan_offset = start; + else + reset_ptenuma_scan(p); + up_read(&mm->mmap_sem); +} + +/* + * Drive the periodic memory faults.. + */ +void task_tick_numa(struct rq *rq, struct task_struct *curr) +{ + struct callback_head *work = &curr->numa_work; + u64 period, now; + + /* + * We don't care about NUMA placement if we don't have memory. + */ + if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work) + return; + + /* + * Using runtime rather than walltime has the dual advantage that + * we (mostly) drive the selection from busy threads and that the + * task needs to have done some actual work before we bother with + * NUMA placement. + */ + now = curr->se.sum_exec_runtime; + period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; + + if (now - curr->node_stamp > period) { + if (!curr->node_stamp) + curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; + curr->node_stamp = now; + + if (!time_before(jiffies, curr->mm->numa_next_scan)) { + init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ + task_work_add(curr, work, true); + } + } +} +#else +static void task_tick_numa(struct rq *rq, struct task_struct *curr) +{ +} +#endif /* CONFIG_NUMA_BALANCING */ + static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -801,72 +1026,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_FAIR_GROUP_SCHED -/* we need this in update_cfs_load and load-balance functions below */ -static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); # ifdef CONFIG_SMP -static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, - int global_update) -{ - struct task_group *tg = cfs_rq->tg; - long load_avg; - - load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); - load_avg -= cfs_rq->load_contribution; - - if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) { - atomic_add(load_avg, &tg->load_weight); - cfs_rq->load_contribution += load_avg; - } -} - -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) -{ - u64 period = sysctl_sched_shares_window; - u64 now, delta; - unsigned long load = cfs_rq->load.weight; - - if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq)) - return; - - now = rq_of(cfs_rq)->clock_task; - delta = now - cfs_rq->load_stamp; - - /* truncate load history at 4 idle periods */ - if (cfs_rq->load_stamp > cfs_rq->load_last && - now - cfs_rq->load_last > 4 * period) { - cfs_rq->load_period = 0; - cfs_rq->load_avg = 0; - delta = period - 1; - } - - cfs_rq->load_stamp = now; - cfs_rq->load_unacc_exec_time = 0; - cfs_rq->load_period += delta; - if (load) { - cfs_rq->load_last = now; - cfs_rq->load_avg += delta * load; - } - - /* consider updating load contribution on each fold or truncate */ - if (global_update || cfs_rq->load_period > period - || !cfs_rq->load_period) - update_cfs_rq_load_contribution(cfs_rq, global_update); - - while (cfs_rq->load_period > period) { - /* - * Inline assembly required to prevent the compiler - * optimising this loop into a divmod call. - * See __iter_div_u64_rem() for another example of this. - */ - asm("" : "+rm" (cfs_rq->load_period)); - cfs_rq->load_period /= 2; - cfs_rq->load_avg /= 2; - } - - if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg) - list_del_leaf_cfs_rq(cfs_rq); -} - static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) { long tg_weight; @@ -876,8 +1036,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) * to gain a more accurate current total weight. See * update_cfs_rq_load_contribution(). */ - tg_weight = atomic_read(&tg->load_weight); - tg_weight -= cfs_rq->load_contribution; + tg_weight = atomic64_read(&tg->load_avg); + tg_weight -= cfs_rq->tg_load_contrib; tg_weight += cfs_rq->load.weight; return tg_weight; @@ -901,27 +1061,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) return shares; } - -static void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ - if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { - update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq); - } -} # else /* CONFIG_SMP */ -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) -{ -} - static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { return tg->shares; } - -static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ -} # endif /* CONFIG_SMP */ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) @@ -939,6 +1083,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, account_entity_enqueue(cfs_rq, se); } +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); + static void update_cfs_shares(struct cfs_rq *cfs_rq) { struct task_group *tg; @@ -958,18 +1104,477 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) reweight_entity(cfs_rq_of(se), se, shares); } #else /* CONFIG_FAIR_GROUP_SCHED */ -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) +static inline void update_cfs_shares(struct cfs_rq *cfs_rq) { } +#endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */ +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) +/* + * We choose a half-life close to 1 scheduling period. + * Note: The tables below are dependent on this value. + */ +#define LOAD_AVG_PERIOD 32 +#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ +#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ + +/* Precomputed fixed inverse multiplies for multiplication by y^n */ +static const u32 runnable_avg_yN_inv[] = { + 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, + 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, + 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, + 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9, + 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80, + 0x85aac367, 0x82cd8698, +}; + +/* + * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent + * over-estimates when re-combining. + */ +static const u32 runnable_avg_yN_sum[] = { + 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103, + 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082, + 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371, +}; + +/* + * Approximate: + * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) + */ +static __always_inline u64 decay_load(u64 val, u64 n) { + unsigned int local_n; + + if (!n) + return val; + else if (unlikely(n > LOAD_AVG_PERIOD * 63)) + return 0; + + /* after bounds checking we can collapse to 32-bit */ + local_n = n; + + /* + * As y^PERIOD = 1/2, we can combine + * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD) + * With a look-up table which covers k^n (n<PERIOD) + * + * To achieve constant time decay_load. + */ + if (unlikely(local_n >= LOAD_AVG_PERIOD)) { + val >>= local_n / LOAD_AVG_PERIOD; + local_n %= LOAD_AVG_PERIOD; + } + + val *= runnable_avg_yN_inv[local_n]; + /* We don't use SRR here since we always want to round down. */ + return val >> 32; } -static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) +/* + * For updates fully spanning n periods, the contribution to runnable + * average will be: \Sum 1024*y^n + * + * We can compute this reasonably efficiently by combining: + * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD} + */ +static u32 __compute_runnable_contrib(u64 n) { + u32 contrib = 0; + + if (likely(n <= LOAD_AVG_PERIOD)) + return runnable_avg_yN_sum[n]; + else if (unlikely(n >= LOAD_AVG_MAX_N)) + return LOAD_AVG_MAX; + + /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */ + do { + contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */ + contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD]; + + n -= LOAD_AVG_PERIOD; + } while (n > LOAD_AVG_PERIOD); + + contrib = decay_load(contrib, n); + return contrib + runnable_avg_yN_sum[n]; } -#endif /* CONFIG_FAIR_GROUP_SCHED */ + +/* + * We can represent the historical contribution to runnable average as the + * coefficients of a geometric series. To do this we sub-divide our runnable + * history into segments of approximately 1ms (1024us); label the segment that + * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. + * + * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ... + * p0 p1 p2 + * (now) (~1ms ago) (~2ms ago) + * + * Let u_i denote the fraction of p_i that the entity was runnable. + * + * We then designate the fractions u_i as our co-efficients, yielding the + * following representation of historical load: + * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ... + * + * We choose y based on the with of a reasonably scheduling period, fixing: + * y^32 = 0.5 + * + * This means that the contribution to load ~32ms ago (u_32) will be weighted + * approximately half as much as the contribution to load within the last ms + * (u_0). + * + * When a period "rolls over" and we have new u_0`, multiplying the previous + * sum again by y is sufficient to update: + * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) + * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] + */ +static __always_inline int __update_entity_runnable_avg(u64 now, + struct sched_avg *sa, + int runnable) +{ + u64 delta, periods; + u32 runnable_contrib; + int delta_w, decayed = 0; + + delta = now - sa->last_runnable_update; + /* + * This should only happen when time goes backwards, which it + * unfortunately does during sched clock init when we swap over to TSC. + */ + if ((s64)delta < 0) { + sa->last_runnable_update = now; + return 0; + } + + /* + * Use 1024ns as the unit of measurement since it's a reasonable + * approximation of 1us and fast to compute. + */ + delta >>= 10; + if (!delta) + return 0; + sa->last_runnable_update = now; + + /* delta_w is the amount already accumulated against our next period */ + delta_w = sa->runnable_avg_period % 1024; + if (delta + delta_w >= 1024) { + /* period roll-over */ + decayed = 1; + + /* + * Now that we know we're crossing a period boundary, figure + * out how much from delta we need to complete the current + * period and accrue it. + */ + delta_w = 1024 - delta_w; + if (runnable) + sa->runnable_avg_sum += delta_w; + sa->runnable_avg_period += delta_w; + + delta -= delta_w; + + /* Figure out how many additional periods this update spans */ + periods = delta / 1024; + delta %= 1024; + + sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, + periods + 1); + sa->runnable_avg_period = decay_load(sa->runnable_avg_period, + periods + 1); + + /* Efficiently calculate \sum (1..n_period) 1024*y^i */ + runnable_contrib = __compute_runnable_contrib(periods); + if (runnable) + sa->runnable_avg_sum += runnable_contrib; + sa->runnable_avg_period += runnable_contrib; + } + + /* Remainder of delta accrued against u_0` */ + if (runnable) + sa->runnable_avg_sum += delta; + sa->runnable_avg_period += delta; + + return decayed; +} + +/* Synchronize an entity's decay with its parenting cfs_rq.*/ +static inline u64 __synchronize_entity_decay(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 decays = atomic64_read(&cfs_rq->decay_counter); + + decays -= se->avg.decay_count; + if (!decays) + return 0; + + se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); + se->avg.decay_count = 0; + + return decays; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, + int force_update) +{ + struct task_group *tg = cfs_rq->tg; + s64 tg_contrib; + + tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; + tg_contrib -= cfs_rq->tg_load_contrib; + + if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) { + atomic64_add(tg_contrib, &tg->load_avg); + cfs_rq->tg_load_contrib += tg_contrib; + } +} + +/* + * Aggregate cfs_rq runnable averages into an equivalent task_group + * representation for computing load contributions. + */ +static inline void __update_tg_runnable_avg(struct sched_avg *sa, + struct cfs_rq *cfs_rq) +{ + struct task_group *tg = cfs_rq->tg; + long contrib; + + /* The fraction of a cpu used by this cfs_rq */ + contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT, + sa->runnable_avg_period + 1); + contrib -= cfs_rq->tg_runnable_contrib; + + if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { + atomic_add(contrib, &tg->runnable_avg); + cfs_rq->tg_runnable_contrib += contrib; + } +} + +static inline void __update_group_entity_contrib(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = group_cfs_rq(se); + struct task_group *tg = cfs_rq->tg; + int runnable_avg; + + u64 contrib; + + contrib = cfs_rq->tg_load_contrib * tg->shares; + se->avg.load_avg_contrib = div64_u64(contrib, + atomic64_read(&tg->load_avg) + 1); + + /* + * For group entities we need to compute a correction term in the case + * that they are consuming <1 cpu so that we would contribute the same + * load as a task of equal weight. + * + * Explicitly co-ordinating this measurement would be expensive, but + * fortunately the sum of each cpus contribution forms a usable + * lower-bound on the true value. + * + * Consider the aggregate of 2 contributions. Either they are disjoint + * (and the sum represents true value) or they are disjoint and we are + * understating by the aggregate of their overlap. + * + * Extending this to N cpus, for a given overlap, the maximum amount we + * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of + * cpus that overlap for this interval and w_i is the interval width. + * + * On a small machine; the first term is well-bounded which bounds the + * total error since w_i is a subset of the period. Whereas on a + * larger machine, while this first term can be larger, if w_i is the + * of consequential size guaranteed to see n_i*w_i quickly converge to + * our upper bound of 1-cpu. + */ + runnable_avg = atomic_read(&tg->runnable_avg); + if (runnable_avg < NICE_0_LOAD) { + se->avg.load_avg_contrib *= runnable_avg; + se->avg.load_avg_contrib >>= NICE_0_SHIFT; + } +} +#else +static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, + int force_update) {} +static inline void __update_tg_runnable_avg(struct sched_avg *sa, + struct cfs_rq *cfs_rq) {} +static inline void __update_group_entity_contrib(struct sched_entity *se) {} +#endif + +static inline void __update_task_entity_contrib(struct sched_entity *se) +{ + u32 contrib; + + /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ + contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); + contrib /= (se->avg.runnable_avg_period + 1); + se->avg.load_avg_contrib = scale_load(contrib); +} + +/* Compute the current contribution to load_avg by se, return any delta */ +static long __update_entity_load_avg_contrib(struct sched_entity *se) +{ + long old_contrib = se->avg.load_avg_contrib; + + if (entity_is_task(se)) { + __update_task_entity_contrib(se); + } else { + __update_tg_runnable_avg(&se->avg, group_cfs_rq(se)); + __update_group_entity_contrib(se); + } + + return se->avg.load_avg_contrib - old_contrib; +} + +static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, + long load_contrib) +{ + if (likely(load_contrib < cfs_rq->blocked_load_avg)) + cfs_rq->blocked_load_avg -= load_contrib; + else + cfs_rq->blocked_load_avg = 0; +} + +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); + +/* Update a sched_entity's runnable average */ +static inline void update_entity_load_avg(struct sched_entity *se, + int update_cfs_rq) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + long contrib_delta; + u64 now; + + /* + * For a group entity we need to use their owned cfs_rq_clock_task() in + * case they are the parent of a throttled hierarchy. + */ + if (entity_is_task(se)) + now = cfs_rq_clock_task(cfs_rq); + else + now = cfs_rq_clock_task(group_cfs_rq(se)); + + if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) + return; + + contrib_delta = __update_entity_load_avg_contrib(se); + + if (!update_cfs_rq) + return; + + if (se->on_rq) + cfs_rq->runnable_load_avg += contrib_delta; + else + subtract_blocked_load_contrib(cfs_rq, -contrib_delta); +} + +/* + * Decay the load contributed by all blocked children and account this so that + * their contribution may appropriately discounted when they wake up. + */ +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) +{ + u64 now = cfs_rq_clock_task(cfs_rq) >> 20; + u64 decays; + + decays = now - cfs_rq->last_decay; + if (!decays && !force_update) + return; + + if (atomic64_read(&cfs_rq->removed_load)) { + u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0); + subtract_blocked_load_contrib(cfs_rq, removed_load); + } + + if (decays) { + cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, + decays); + atomic64_add(decays, &cfs_rq->decay_counter); + cfs_rq->last_decay = now; + } + + __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); +} + +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) +{ + __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable); + __update_tg_runnable_avg(&rq->avg, &rq->cfs); +} + +/* Add the load generated by se into cfs_rq's child load-average */ +static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se, + int wakeup) +{ + /* + * We track migrations using entity decay_count <= 0, on a wake-up + * migration we use a negative decay count to track the remote decays + * accumulated while sleeping. + */ + if (unlikely(se->avg.decay_count <= 0)) { + se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task; + if (se->avg.decay_count) { + /* + * In a wake-up migration we have to approximate the + * time sleeping. This is because we can't synchronize + * clock_task between the two cpus, and it is not + * guaranteed to be read-safe. Instead, we can + * approximate this using our carried decays, which are + * explicitly atomically readable. + */ + se->avg.last_runnable_update -= (-se->avg.decay_count) + << 20; + update_entity_load_avg(se, 0); + /* Indicate that we're now synchronized and on-rq */ + se->avg.decay_count = 0; + } + wakeup = 0; + } else { + __synchronize_entity_decay(se); + } + + /* migrated tasks did not contribute to our blocked load */ + if (wakeup) { + subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); + update_entity_load_avg(se, 0); + } + + cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; + /* we force update consideration on load-balancer moves */ + update_cfs_rq_blocked_load(cfs_rq, !wakeup); +} + +/* + * Remove se's load from this cfs_rq child load-average, if the entity is + * transitioning to a blocked state we track its projected decay using + * blocked_load_avg. + */ +static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se, + int sleep) +{ + update_entity_load_avg(se, 1); + /* we force update consideration on load-balancer moves */ + update_cfs_rq_blocked_load(cfs_rq, !sleep); + + cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; + if (sleep) { + cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; + se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); + } /* migrations, e.g. sleep=0 leave decay_count == 0 */ +} +#else +static inline void update_entity_load_avg(struct sched_entity *se, + int update_cfs_rq) {} +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} +static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se, + int wakeup) {} +static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se, + int sleep) {} +static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, + int force_update) {} +#endif static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -1096,7 +1701,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - update_cfs_load(cfs_rq, 0); + enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -1171,6 +1776,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); update_stats_dequeue(cfs_rq, se); if (flags & DEQUEUE_SLEEP) { @@ -1191,7 +1797,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->on_rq = 0; - update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); /* @@ -1340,6 +1945,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); + /* in !on_rq case, update occurred at dequeue */ + update_entity_load_avg(prev, 1); } cfs_rq->curr = NULL; } @@ -1353,9 +1960,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) update_curr(cfs_rq); /* - * Update share accounting for long-running entities. + * Ensure that runnable average is periodically updated. */ - update_entity_shares_tick(cfs_rq); + update_entity_load_avg(curr, 1); + update_cfs_rq_blocked_load(cfs_rq, 1); #ifdef CONFIG_SCHED_HRTICK /* @@ -1448,6 +2056,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) return &tg->cfs_bandwidth; } +/* rq->task_clock normalized against any time this cfs_rq has spent throttled */ +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) +{ + if (unlikely(cfs_rq->throttle_count)) + return cfs_rq->throttled_clock_task; + + return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time; +} + /* returns 0 on failure to allocate runtime */ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { @@ -1592,14 +2209,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttle_count--; #ifdef CONFIG_SMP if (!cfs_rq->throttle_count) { - u64 delta = rq->clock_task - cfs_rq->load_stamp; - - /* leaving throttled state, advance shares averaging windows */ - cfs_rq->load_stamp += delta; - cfs_rq->load_last += delta; - - /* update entity weight now that we are on_rq again */ - update_cfs_shares(cfs_rq); + /* adjust cfs_rq_clock_task() */ + cfs_rq->throttled_clock_task_time += rq->clock_task - + cfs_rq->throttled_clock_task; } #endif @@ -1611,9 +2223,9 @@ static int tg_throttle_down(struct task_group *tg, void *data) struct rq *rq = data; struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - /* group is entering throttled state, record last load */ + /* group is entering throttled state, stop time */ if (!cfs_rq->throttle_count) - update_cfs_load(cfs_rq, 0); + cfs_rq->throttled_clock_task = rq->clock_task; cfs_rq->throttle_count++; return 0; @@ -1628,7 +2240,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; - /* account load preceding throttle */ + /* freeze hierarchy runnable averages while throttled */ rcu_read_lock(); walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); @@ -1652,7 +2264,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) rq->nr_running -= task_delta; cfs_rq->throttled = 1; - cfs_rq->throttled_timestamp = rq->clock; + cfs_rq->throttled_clock = rq->clock; raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); raw_spin_unlock(&cfs_b->lock); @@ -1670,10 +2282,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 0; raw_spin_lock(&cfs_b->lock); - cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; + cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock; list_del_rcu(&cfs_rq->throttled_list); raw_spin_unlock(&cfs_b->lock); - cfs_rq->throttled_timestamp = 0; update_rq_clock(rq); /* update hierarchical throttle state */ @@ -2073,8 +2684,13 @@ static void unthrottle_offline_cfs_rqs(struct rq *rq) } #else /* CONFIG_CFS_BANDWIDTH */ -static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {} +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) +{ + return rq_of(cfs_rq)->clock_task; +} + +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, + unsigned long delta_exec) {} static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} @@ -2207,12 +2823,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); + update_entity_load_avg(se, 1); } - if (!se) + if (!se) { + update_rq_runnable_avg(rq, rq->nr_running); inc_nr_running(rq); + } hrtick_update(rq); } @@ -2266,12 +2884,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); + update_entity_load_avg(se, 1); } - if (!se) + if (!se) { dec_nr_running(rq); + update_rq_runnable_avg(rq, 1); + } hrtick_update(rq); } @@ -2700,7 +3320,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) int prev_cpu = task_cpu(p); int new_cpu = cpu; int want_affine = 0; - int want_sd = 1; int sync = wake_flags & WF_SYNC; if (p->nr_cpus_allowed == 1) @@ -2718,48 +3337,21 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) continue; /* - * If power savings logic is enabled for a domain, see if we - * are not overloaded, if so, don't balance wider. - */ - if (tmp->flags & (SD_PREFER_LOCAL)) { - unsigned long power = 0; - unsigned long nr_running = 0; - unsigned long capacity; - int i; - - for_each_cpu(i, sched_domain_span(tmp)) { - power += power_of(i); - nr_running += cpu_rq(i)->cfs.nr_running; - } - - capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); - - if (nr_running < capacity) - want_sd = 0; - } - - /* * If both cpu and prev_cpu are part of this domain, * cpu is a valid SD_WAKE_AFFINE target. */ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { affine_sd = tmp; - want_affine = 0; - } - - if (!want_sd && !want_affine) break; + } - if (!(tmp->flags & sd_flag)) - continue; - - if (want_sd) + if (tmp->flags & sd_flag) sd = tmp; } if (affine_sd) { - if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) + if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) prev_cpu = cpu; new_cpu = select_idle_sibling(p, prev_cpu); @@ -2809,6 +3401,37 @@ unlock: return new_cpu; } + +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). + */ +#ifdef CONFIG_FAIR_GROUP_SCHED +/* + * Called immediately before a task is migrated to a new cpu; task_cpu(p) and + * cfs_rq_of(p) references at time of call are still valid and identify the + * previous cpu. However, the caller only guarantees p->pi_lock is held; no + * other assumptions, including the state of rq->lock, should be made. + */ +static void +migrate_task_rq_fair(struct task_struct *p, int next_cpu) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* + * Load tracking: accumulate removed load so that it can be processed + * when we next update owning cfs_rq under rq->lock. Tasks contribute + * to blocked load iff they have a positive decay-count. It can never + * be negative here since on-rq tasks have decay-count == 0. + */ + if (se->avg.decay_count) { + se->avg.decay_count = -__synchronize_entity_decay(se); + atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); + } +} +#endif #endif /* CONFIG_SMP */ static unsigned long @@ -2935,7 +3558,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ * Batch and idle tasks do not preempt non-idle tasks (their preemption * is driven by the tick): */ - if (unlikely(p->policy != SCHED_NORMAL)) + if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) return; find_matching_se(&se, &pse); @@ -3061,8 +3684,122 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp #ifdef CONFIG_SMP /************************************************** - * Fair scheduling class load-balancing methods: - */ + * Fair scheduling class load-balancing methods. + * + * BASICS + * + * The purpose of load-balancing is to achieve the same basic fairness the + * per-cpu scheduler provides, namely provide a proportional amount of compute + * time to each task. This is expressed in the following equation: + * + * W_i,n/P_i == W_j,n/P_j for all i,j (1) + * + * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight + * W_i,0 is defined as: + * + * W_i,0 = \Sum_j w_i,j (2) + * + * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight + * is derived from the nice value as per prio_to_weight[]. + * + * The weight average is an exponential decay average of the instantaneous + * weight: + * + * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) + * + * P_i is the cpu power (or compute capacity) of cpu i, typically it is the + * fraction of 'recent' time available for SCHED_OTHER task execution. But it + * can also include other factors [XXX]. + * + * To achieve this balance we define a measure of imbalance which follows + * directly from (1): + * + * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4) + * + * We them move tasks around to minimize the imbalance. In the continuous + * function space it is obvious this converges, in the discrete case we get + * a few fun cases generally called infeasible weight scenarios. + * + * [XXX expand on: + * - infeasible weights; + * - local vs global optima in the discrete case. ] + * + * + * SCHED DOMAINS + * + * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) + * for all i,j solution, we create a tree of cpus that follows the hardware + * topology where each level pairs two lower groups (or better). This results + * in O(log n) layers. Furthermore we reduce the number of cpus going up the + * tree to only the first of the previous level and we decrease the frequency + * of load-balance at each level inv. proportional to the number of cpus in + * the groups. + * + * This yields: + * + * log_2 n 1 n + * \Sum { --- * --- * 2^i } = O(n) (5) + * i = 0 2^i 2^i + * `- size of each group + * | | `- number of cpus doing load-balance + * | `- freq + * `- sum over all levels + * + * Coupled with a limit on how many tasks we can migrate every balance pass, + * this makes (5) the runtime complexity of the balancer. + * + * An important property here is that each CPU is still (indirectly) connected + * to every other cpu in at most O(log n) steps: + * + * The adjacency matrix of the resulting graph is given by: + * + * log_2 n + * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) + * k = 0 + * + * And you'll find that: + * + * A^(log_2 n)_i,j != 0 for all i,j (7) + * + * Showing there's indeed a path between every cpu in at most O(log n) steps. + * The task movement gives a factor of O(m), giving a convergence complexity + * of: + * + * O(nm log n), n := nr_cpus, m := nr_tasks (8) + * + * + * WORK CONSERVING + * + * In order to avoid CPUs going idle while there's still work to do, new idle + * balancing is more aggressive and has the newly idle cpu iterate up the domain + * tree itself instead of relying on other CPUs to bring it work. + * + * This adds some complexity to both (5) and (8) but it reduces the total idle + * time. + * + * [XXX more?] + * + * + * CGROUPS + * + * Cgroups make a horror show out of (2), instead of a simple sum we get: + * + * s_k,i + * W_i,0 = \Sum_j \Prod_k w_k * ----- (9) + * S_k + * + * Where + * + * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) + * + * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i. + * + * The big problem is S_k, its a global sum needed to compute a local (W_i) + * property. + * + * [XXX write more on how we solve this.. _after_ merging pjt's patches that + * rewrite all of this once again.] + */ static unsigned long __read_mostly max_load_balance_interval = HZ/10; @@ -3328,52 +4065,58 @@ next: /* * update tg->load_weight by folding this cpu's load_avg */ -static int update_shares_cpu(struct task_group *tg, int cpu) +static void __update_blocked_averages_cpu(struct task_group *tg, int cpu) { - struct cfs_rq *cfs_rq; - unsigned long flags; - struct rq *rq; - - if (!tg->se[cpu]) - return 0; - - rq = cpu_rq(cpu); - cfs_rq = tg->cfs_rq[cpu]; - - raw_spin_lock_irqsave(&rq->lock, flags); - - update_rq_clock(rq); - update_cfs_load(cfs_rq, 1); + struct sched_entity *se = tg->se[cpu]; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; - /* - * We need to update shares after updating tg->load_weight in - * order to adjust the weight of groups with long running tasks. - */ - update_cfs_shares(cfs_rq); + /* throttled entities do not contribute to load */ + if (throttled_hierarchy(cfs_rq)) + return; - raw_spin_unlock_irqrestore(&rq->lock, flags); + update_cfs_rq_blocked_load(cfs_rq, 1); - return 0; + if (se) { + update_entity_load_avg(se, 1); + /* + * We pivot on our runnable average having decayed to zero for + * list removal. This generally implies that all our children + * have also been removed (modulo rounding error or bandwidth + * control); however, such cases are rare and we can fix these + * at enqueue. + * + * TODO: fix up out-of-order children on enqueue. + */ + if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running) + list_del_leaf_cfs_rq(cfs_rq); + } else { + struct rq *rq = rq_of(cfs_rq); + update_rq_runnable_avg(rq, rq->nr_running); + } } -static void update_shares(int cpu) +static void update_blocked_averages(int cpu) { - struct cfs_rq *cfs_rq; struct rq *rq = cpu_rq(cpu); + struct cfs_rq *cfs_rq; + unsigned long flags; - rcu_read_lock(); + raw_spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); /* * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. */ for_each_leaf_cfs_rq(rq, cfs_rq) { - /* throttled entities do not contribute to load */ - if (throttled_hierarchy(cfs_rq)) - continue; - - update_shares_cpu(cfs_rq->tg, cpu); + /* + * Note: We may want to consider periodically releasing + * rq->lock about these updates so that creating many task + * groups does not result in continually extending hold time. + */ + __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu); } - rcu_read_unlock(); + + raw_spin_unlock_irqrestore(&rq->lock, flags); } /* @@ -3425,7 +4168,7 @@ static unsigned long task_h_load(struct task_struct *p) return load; } #else -static inline void update_shares(int cpu) +static inline void update_blocked_averages(int cpu) { } @@ -4295,7 +5038,7 @@ redo: goto out_balanced; } - BUG_ON(busiest == this_rq); + BUG_ON(busiest == env.dst_rq); schedstat_add(sd, lb_imbalance[idle], env.imbalance); @@ -4316,7 +5059,7 @@ redo: update_h_load(env.src_cpu); more_balance: local_irq_save(flags); - double_rq_lock(this_rq, busiest); + double_rq_lock(env.dst_rq, busiest); /* * cur_ld_moved - load moved in current iteration @@ -4324,7 +5067,7 @@ more_balance: */ cur_ld_moved = move_tasks(&env); ld_moved += cur_ld_moved; - double_rq_unlock(this_rq, busiest); + double_rq_unlock(env.dst_rq, busiest); local_irq_restore(flags); if (env.flags & LBF_NEED_BREAK) { @@ -4360,8 +5103,7 @@ more_balance: if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && lb_iterations++ < max_lb_iterations) { - this_rq = cpu_rq(env.new_dst_cpu); - env.dst_rq = this_rq; + env.dst_rq = cpu_rq(env.new_dst_cpu); env.dst_cpu = env.new_dst_cpu; env.flags &= ~LBF_SOME_PINNED; env.loop = 0; @@ -4486,12 +5228,14 @@ void idle_balance(int this_cpu, struct rq *this_rq) if (this_rq->avg_idle < sysctl_sched_migration_cost) return; + update_rq_runnable_avg(this_rq, 1); + /* * Drop the rq->lock, but keep IRQ/preempt disabled. */ raw_spin_unlock(&this_rq->lock); - update_shares(this_cpu); + update_blocked_averages(this_cpu); rcu_read_lock(); for_each_domain(this_cpu, sd) { unsigned long interval; @@ -4646,7 +5390,7 @@ static void nohz_balancer_kick(int cpu) return; } -static inline void clear_nohz_tick_stopped(int cpu) +static inline void nohz_balance_exit_idle(int cpu) { if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); @@ -4686,28 +5430,23 @@ void set_cpu_sd_state_idle(void) } /* - * This routine will record that this cpu is going idle with tick stopped. + * This routine will record that the cpu is going idle with tick stopped. * This info will be used in performing idle load balancing in the future. */ -void select_nohz_load_balancer(int stop_tick) +void nohz_balance_enter_idle(int cpu) { - int cpu = smp_processor_id(); - /* * If this cpu is going down, then nothing needs to be done. */ if (!cpu_active(cpu)) return; - if (stop_tick) { - if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) - return; + if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) + return; - cpumask_set_cpu(cpu, nohz.idle_cpus_mask); - atomic_inc(&nohz.nr_cpus); - set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); - } - return; + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); + atomic_inc(&nohz.nr_cpus); + set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, @@ -4715,7 +5454,7 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, { switch (action & ~CPU_TASKS_FROZEN) { case CPU_DYING: - clear_nohz_tick_stopped(smp_processor_id()); + nohz_balance_exit_idle(smp_processor_id()); return NOTIFY_OK; default: return NOTIFY_DONE; @@ -4751,7 +5490,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) int update_next_balance = 0; int need_serialize; - update_shares(cpu); + update_blocked_averages(cpu); rcu_read_lock(); for_each_domain(cpu, sd) { @@ -4837,14 +5576,15 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) if (need_resched()) break; - raw_spin_lock_irq(&this_rq->lock); - update_rq_clock(this_rq); - update_idle_cpu_load(this_rq); - raw_spin_unlock_irq(&this_rq->lock); + rq = cpu_rq(balance_cpu); + + raw_spin_lock_irq(&rq->lock); + update_rq_clock(rq); + update_idle_cpu_load(rq); + raw_spin_unlock_irq(&rq->lock); rebalance_domains(balance_cpu, CPU_IDLE); - rq = cpu_rq(balance_cpu); if (time_after(this_rq->next_balance, rq->next_balance)) this_rq->next_balance = rq->next_balance; } @@ -4875,7 +5615,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) * busy tick after returning from idle, we will update the busy stats. */ set_cpu_sd_state_busy(); - clear_nohz_tick_stopped(cpu); + nohz_balance_exit_idle(cpu); /* * None are in tickless mode and hence no need for NOHZ idle load @@ -4987,6 +5727,11 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) cfs_rq = cfs_rq_of(se); entity_tick(cfs_rq, se, queued); } + + if (sched_feat_numa(NUMA)) + task_tick_numa(rq, curr); + + update_rq_runnable_avg(rq, 1); } /* @@ -5079,6 +5824,20 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) place_entity(cfs_rq, se, 0); se->vruntime -= cfs_rq->min_vruntime; } + +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) + /* + * Remove our load from contribution when we leave sched_fair + * and ensure we don't carry in an old decay_count if we + * switch back. + */ + if (p->se.avg.decay_count) { + struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); + __synchronize_entity_decay(&p->se); + subtract_blocked_load_contrib(cfs_rq, + p->se.avg.load_avg_contrib); + } +#endif } /* @@ -5125,11 +5884,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) + atomic64_set(&cfs_rq->decay_counter, 1); + atomic64_set(&cfs_rq->removed_load, 0); +#endif } #ifdef CONFIG_FAIR_GROUP_SCHED static void task_move_group_fair(struct task_struct *p, int on_rq) { + struct cfs_rq *cfs_rq; /* * If the task was not on the rq at the time of this cgroup movement * it must have been asleep, sleeping tasks keep their ->vruntime @@ -5161,8 +5925,19 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) if (!on_rq) p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; set_task_rq(p, task_cpu(p)); - if (!on_rq) - p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; + if (!on_rq) { + cfs_rq = cfs_rq_of(&p->se); + p->se.vruntime += cfs_rq->min_vruntime; +#ifdef CONFIG_SMP + /* + * migrate_task_rq_fair() will have removed our previous + * contribution, but we must synchronize for ongoing future + * decay. + */ + p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); + cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; +#endif + } } void free_fair_sched_group(struct task_group *tg) @@ -5247,10 +6022,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, cfs_rq->tg = tg; cfs_rq->rq = rq; -#ifdef CONFIG_SMP - /* allow initial update_cfs_load() to truncate */ - cfs_rq->load_stamp = 1; -#endif init_cfs_rq_runtime(cfs_rq); tg->cfs_rq[cpu] = cfs_rq; @@ -5352,7 +6123,9 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_fair, - +#ifdef CONFIG_FAIR_GROUP_SCHED + .migrate_task_rq = migrate_task_rq_fair, +#endif .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, diff --git a/kernel/sched/features.h b/kernel/sched/features.h index de00a486c5c6..1ad1d2b5395f 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -12,14 +12,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) SCHED_FEAT(START_DEBIT, true) /* - * Based on load and program behaviour, see if it makes sense to place - * a newly woken task on the same cpu as the task that woke it -- - * improve cache locality. Typically used with SYNC wakeups as - * generated by pipes and the like, see also SYNC_WAKEUPS. - */ -SCHED_FEAT(AFFINE_WAKEUPS, true) - -/* * Prefer to schedule the task we woke last (assuming it failed * wakeup-preemption), since its likely going to consume data we * touched, increases cache locality. @@ -40,9 +32,14 @@ SCHED_FEAT(LAST_BUDDY, true) SCHED_FEAT(CACHE_HOT_BUDDY, true) /* + * Allow wakeup-time preemption of the current task: + */ +SCHED_FEAT(WAKEUP_PREEMPTION, true) + +/* * Use arch dependent cpu power functions */ -SCHED_FEAT(ARCH_POWER, false) +SCHED_FEAT(ARCH_POWER, true) SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) @@ -69,3 +66,14 @@ SCHED_FEAT(TTWU_QUEUE, true) SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) + +/* + * Apply the automatic NUMA scheduling policy. Enabled automatically + * at runtime if running on a NUMA machine. Can be controlled via + * numa_balancing=. Allow PTE scanning to be forced on UMA machines + * for debugging the core machinery. + */ +#ifdef CONFIG_NUMA_BALANCING +SCHED_FEAT(NUMA, false) +SCHED_FEAT(NUMA_FORCE, false) +#endif diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e0b7ba9c040f..418feb01344e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1632,11 +1632,6 @@ static int push_rt_task(struct rq *rq) if (!next_task) return 0; -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - if (unlikely(task_running(rq, next_task))) - return 0; -#endif - retry: if (unlikely(next_task == rq->curr)) { WARN_ON(1); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0848fa36c383..fc886441436a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -112,6 +112,8 @@ struct task_group { unsigned long shares; atomic_t load_weight; + atomic64_t load_avg; + atomic_t runnable_avg; #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -222,22 +224,29 @@ struct cfs_rq { unsigned int nr_spread_over; #endif +#ifdef CONFIG_SMP +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). + */ #ifdef CONFIG_FAIR_GROUP_SCHED - struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ - /* - * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in - * a hierarchy). Non-leaf lrqs hold other higher schedulable entities - * (like users, containers etc.) - * - * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This - * list is used during load balance. + * CFS Load tracking + * Under CFS, load is tracked on a per-entity basis and aggregated up. + * This allows for the description of both thread and group usage (in + * the FAIR_GROUP_SCHED case). */ - int on_list; - struct list_head leaf_cfs_rq_list; - struct task_group *tg; /* group that "owns" this runqueue */ + u64 runnable_load_avg, blocked_load_avg; + atomic64_t decay_counter, removed_load; + u64 last_decay; +#endif /* CONFIG_FAIR_GROUP_SCHED */ +/* These always depend on CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_FAIR_GROUP_SCHED + u32 tg_runnable_contrib; + u64 tg_load_contrib; +#endif /* CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_SMP /* * h_load = weight * f(tg) * @@ -245,26 +254,30 @@ struct cfs_rq { * this group. */ unsigned long h_load; +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_FAIR_GROUP_SCHED + struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ /* - * Maintaining per-cpu shares distribution for group scheduling + * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in + * a hierarchy). Non-leaf lrqs hold other higher schedulable entities + * (like users, containers etc.) * - * load_stamp is the last time we updated the load average - * load_last is the last time we updated the load average and saw load - * load_unacc_exec_time is currently unaccounted execution time + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This + * list is used during load balance. */ - u64 load_avg; - u64 load_period; - u64 load_stamp, load_last, load_unacc_exec_time; + int on_list; + struct list_head leaf_cfs_rq_list; + struct task_group *tg; /* group that "owns" this runqueue */ - unsigned long load_contribution; -#endif /* CONFIG_SMP */ #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; u64 runtime_expires; s64 runtime_remaining; - u64 throttled_timestamp; + u64 throttled_clock, throttled_clock_task; + u64 throttled_clock_task_time; int throttled, throttle_count; struct list_head throttled_list; #endif /* CONFIG_CFS_BANDWIDTH */ @@ -467,6 +480,8 @@ struct rq { #ifdef CONFIG_SMP struct llist_head wake_list; #endif + + struct sched_avg avg; }; static inline int cpu_of(struct rq *rq) @@ -648,6 +663,18 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ +#ifdef CONFIG_NUMA_BALANCING +#define sched_feat_numa(x) sched_feat(x) +#ifdef CONFIG_SCHED_DEBUG +#define numabalancing_enabled sched_feat_numa(NUMA) +#else +extern bool numabalancing_enabled; +#endif /* CONFIG_SCHED_DEBUG */ +#else +#define sched_feat_numa(x) (0) +#define numabalancing_enabled (0) +#endif /* CONFIG_NUMA_BALANCING */ + static inline u64 global_rt_period(void) { return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; @@ -737,11 +764,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) */ next->on_cpu = 1; #endif -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - raw_spin_unlock_irq(&rq->lock); -#else raw_spin_unlock(&rq->lock); -#endif } static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) @@ -755,9 +778,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) smp_wmb(); prev->on_cpu = 0; #endif -#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW local_irq_enable(); -#endif } #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ @@ -891,6 +912,9 @@ struct cpuacct { struct kernel_cpustat __percpu *cpustat; }; +extern struct cgroup_subsys cpuacct_subsys; +extern struct cpuacct root_cpuacct; + /* return cpu accounting group corresponding to this container */ static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) { @@ -917,6 +941,16 @@ extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} #endif +#ifdef CONFIG_PARAVIRT +static inline u64 steal_ticks(u64 steal) +{ + if (unlikely(steal > NSEC_PER_SEC)) + return div_u64(steal, TICK_NSEC); + + return __iter_div_u64_rem(steal, TICK_NSEC, &steal); +} +#endif + static inline void inc_nr_running(struct rq *rq) { rq->nr_running++; @@ -1156,3 +1190,52 @@ enum rq_nohz_flag_bits { #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) #endif + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +DECLARE_PER_CPU(u64, cpu_hardirq_time); +DECLARE_PER_CPU(u64, cpu_softirq_time); + +#ifndef CONFIG_64BIT +DECLARE_PER_CPU(seqcount_t, irq_time_seq); + +static inline void irq_time_write_begin(void) +{ + __this_cpu_inc(irq_time_seq.sequence); + smp_wmb(); +} + +static inline void irq_time_write_end(void) +{ + smp_wmb(); + __this_cpu_inc(irq_time_seq.sequence); +} + +static inline u64 irq_time_read(int cpu) +{ + u64 irq_time; + unsigned seq; + + do { + seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); + irq_time = per_cpu(cpu_softirq_time, cpu) + + per_cpu(cpu_hardirq_time, cpu); + } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); + + return irq_time; +} +#else /* CONFIG_64BIT */ +static inline void irq_time_write_begin(void) +{ +} + +static inline void irq_time_write_end(void) +{ +} + +static inline u64 irq_time_read(int cpu) +{ + return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); +} +#endif /* CONFIG_64BIT */ +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index ee376beedaf9..5af44b593770 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -396,25 +396,29 @@ int __secure_computing(int this_syscall) #ifdef CONFIG_SECCOMP_FILTER case SECCOMP_MODE_FILTER: { int data; + struct pt_regs *regs = task_pt_regs(current); ret = seccomp_run_filters(this_syscall); data = ret & SECCOMP_RET_DATA; ret &= SECCOMP_RET_ACTION; switch (ret) { case SECCOMP_RET_ERRNO: /* Set the low-order 16-bits as a errno. */ - syscall_set_return_value(current, task_pt_regs(current), + syscall_set_return_value(current, regs, -data, 0); goto skip; case SECCOMP_RET_TRAP: /* Show the handler the original registers. */ - syscall_rollback(current, task_pt_regs(current)); + syscall_rollback(current, regs); /* Let the filter pass back 16 bits of data. */ seccomp_send_sigsys(this_syscall, data); goto skip; case SECCOMP_RET_TRACE: /* Skip these calls if there is no tracer. */ - if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) + if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { + syscall_set_return_value(current, regs, + -ENOSYS, 0); goto skip; + } /* Allow the BPF to provide the event message */ ptrace_event(PTRACE_EVENT_SECCOMP, data); /* @@ -425,6 +429,9 @@ int __secure_computing(int this_syscall) */ if (fatal_signal_pending(current)) break; + if (syscall_get_nr(current, regs) < 0) + goto skip; /* Explicit request to skip. */ + return 0; case SECCOMP_RET_ALLOW: return 0; diff --git a/kernel/signal.c b/kernel/signal.c index be4f856d52f8..7aaa51d8e5b8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -17,6 +17,7 @@ #include <linux/fs.h> #include <linux/tty.h> #include <linux/binfmts.h> +#include <linux/coredump.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/ptrace.h> @@ -30,6 +31,7 @@ #include <linux/nsproxy.h> #include <linux/user_namespace.h> #include <linux/uprobes.h> +#include <linux/compat.h> #define CREATE_TRACE_POINTS #include <trace/events/signal.h> @@ -1158,8 +1160,9 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, return __send_signal(sig, info, t, group, from_ancestor_ns); } -static void print_fatal_signal(struct pt_regs *regs, int signr) +static void print_fatal_signal(int signr) { + struct pt_regs *regs = signal_pt_regs(); printk("%s/%d: potentially unexpected fatal signal %d.\n", current->comm, task_pid_nr(current), signr); @@ -1751,7 +1754,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, * see comment in do_notify_parent() about the following 4 lines */ rcu_read_lock(); - info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); + info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent)); info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); rcu_read_unlock(); @@ -1907,7 +1910,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) preempt_disable(); read_unlock(&tasklist_lock); preempt_enable_no_resched(); - schedule(); + freezable_schedule(); } else { /* * By the time we got the lock, our tracer went away. @@ -1929,13 +1932,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) } /* - * While in TASK_TRACED, we were considered "frozen enough". - * Now that we woke up, it's crucial if we're supposed to be - * frozen that we freeze now before running anything substantial. - */ - try_to_freeze(); - - /* * We are back. Now reacquire the siglock before touching * last_siginfo, so that we are sure to have synchronized with * any signal-sending on another CPU that wants to examine it. @@ -1971,13 +1967,8 @@ static void ptrace_do_notify(int signr, int exit_code, int why) void ptrace_notify(int exit_code) { BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); - if (unlikely(current->task_works)) { - if (test_and_clear_ti_thread_flag(current_thread_info(), - TIF_NOTIFY_RESUME)) { - smp_mb__after_clear_bit(); - task_work_run(); - } - } + if (unlikely(current->task_works)) + task_work_run(); spin_lock_irq(¤t->sighand->siglock); ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED); @@ -2096,7 +2087,7 @@ static bool do_signal_stop(int signr) } /* Now we don't run again until woken by SIGCONT or SIGKILL */ - schedule(); + freezable_schedule(); return true; } else { /* @@ -2142,10 +2133,9 @@ static void do_jobctl_trap(void) } } -static int ptrace_signal(int signr, siginfo_t *info, - struct pt_regs *regs, void *cookie) +static int ptrace_signal(int signr, siginfo_t *info) { - ptrace_signal_deliver(regs, cookie); + ptrace_signal_deliver(); /* * We do not check sig_kernel_stop(signr) but set this marker * unconditionally because we do not know whether debugger will @@ -2198,26 +2188,20 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct signal_struct *signal = current->signal; int signr; - if (unlikely(current->task_works)) { - if (test_and_clear_ti_thread_flag(current_thread_info(), - TIF_NOTIFY_RESUME)) { - smp_mb__after_clear_bit(); - task_work_run(); - } - } + if (unlikely(current->task_works)) + task_work_run(); if (unlikely(uprobe_deny_signal())) return 0; -relock: /* - * We'll jump back here after any time we were stopped in TASK_STOPPED. - * While in TASK_STOPPED, we were considered "frozen enough". - * Now that we woke up, it's crucial if we're supposed to be - * frozen that we freeze now before running anything substantial. + * Do this once, we can't return to user-mode if freezing() == T. + * do_signal_stop() and ptrace_stop() do freezable_schedule() and + * thus do not need another check after return. */ try_to_freeze(); +relock: spin_lock_irq(&sighand->siglock); /* * Every stopped thread goes here after wakeup. Check to see if @@ -2274,8 +2258,7 @@ relock: break; /* will return 0 */ if (unlikely(current->ptrace) && signr != SIGKILL) { - signr = ptrace_signal(signr, info, - regs, cookie); + signr = ptrace_signal(signr, info); if (!signr) continue; } @@ -2360,7 +2343,7 @@ relock: if (sig_kernel_coredump(signr)) { if (print_fatal_signals) - print_fatal_signal(regs, info->si_signo); + print_fatal_signal(info->si_signo); /* * If it was able to dump core, this kills all * other threads in the group and synchronizes with @@ -2369,7 +2352,7 @@ relock: * first and our do_group_exit call below will use * that value and ignore the one we pass it. */ - do_coredump(info->si_signo, info->si_signo, regs); + do_coredump(info); } /* @@ -3112,6 +3095,79 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s out: return error; } +#ifdef CONFIG_GENERIC_SIGALTSTACK +SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) +{ + return do_sigaltstack(uss, uoss, current_user_stack_pointer()); +} +#endif + +int restore_altstack(const stack_t __user *uss) +{ + int err = do_sigaltstack(uss, NULL, current_user_stack_pointer()); + /* squash all but EFAULT for now */ + return err == -EFAULT ? err : 0; +} + +int __save_altstack(stack_t __user *uss, unsigned long sp) +{ + struct task_struct *t = current; + return __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) | + __put_user(sas_ss_flags(sp), &uss->ss_flags) | + __put_user(t->sas_ss_size, &uss->ss_size); +} + +#ifdef CONFIG_COMPAT +#ifdef CONFIG_GENERIC_SIGALTSTACK +asmlinkage long compat_sys_sigaltstack(const compat_stack_t __user *uss_ptr, + compat_stack_t __user *uoss_ptr) +{ + stack_t uss, uoss; + int ret; + mm_segment_t seg; + + if (uss_ptr) { + compat_stack_t uss32; + + memset(&uss, 0, sizeof(stack_t)); + if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t))) + return -EFAULT; + uss.ss_sp = compat_ptr(uss32.ss_sp); + uss.ss_flags = uss32.ss_flags; + uss.ss_size = uss32.ss_size; + } + seg = get_fs(); + set_fs(KERNEL_DS); + ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL), + (stack_t __force __user *) &uoss, + compat_user_stack_pointer()); + set_fs(seg); + if (ret >= 0 && uoss_ptr) { + if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(compat_stack_t)) || + __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) || + __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) || + __put_user(uoss.ss_size, &uoss_ptr->ss_size)) + ret = -EFAULT; + } + return ret; +} + +int compat_restore_altstack(const compat_stack_t __user *uss) +{ + int err = compat_sys_sigaltstack(uss, NULL); + /* squash all but -EFAULT for now */ + return err == -EFAULT ? err : 0; +} + +int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) +{ + struct task_struct *t = current; + return __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) | + __put_user(sas_ss_flags(sp), &uss->ss_flags) | + __put_user(t->sas_ss_size, &uss->ss_size); +} +#endif +#endif #ifdef __ARCH_WANT_SYS_SIGPENDING diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 98f60c5caa1b..d6c5fc054242 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -1,14 +1,22 @@ /* * Common SMP CPU bringup/teardown functions */ +#include <linux/cpu.h> #include <linux/err.h> #include <linux/smp.h> #include <linux/init.h> +#include <linux/list.h> +#include <linux/slab.h> #include <linux/sched.h> +#include <linux/export.h> #include <linux/percpu.h> +#include <linux/kthread.h> +#include <linux/smpboot.h> #include "smpboot.h" +#ifdef CONFIG_SMP + #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD /* * For the hotplug case we keep the task structs around and reuse @@ -65,3 +73,228 @@ void __init idle_threads_init(void) } } #endif + +#endif /* #ifdef CONFIG_SMP */ + +static LIST_HEAD(hotplug_threads); +static DEFINE_MUTEX(smpboot_threads_lock); + +struct smpboot_thread_data { + unsigned int cpu; + unsigned int status; + struct smp_hotplug_thread *ht; +}; + +enum { + HP_THREAD_NONE = 0, + HP_THREAD_ACTIVE, + HP_THREAD_PARKED, +}; + +/** + * smpboot_thread_fn - percpu hotplug thread loop function + * @data: thread data pointer + * + * Checks for thread stop and park conditions. Calls the necessary + * setup, cleanup, park and unpark functions for the registered + * thread. + * + * Returns 1 when the thread should exit, 0 otherwise. + */ +static int smpboot_thread_fn(void *data) +{ + struct smpboot_thread_data *td = data; + struct smp_hotplug_thread *ht = td->ht; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + preempt_disable(); + if (kthread_should_stop()) { + set_current_state(TASK_RUNNING); + preempt_enable(); + if (ht->cleanup) + ht->cleanup(td->cpu, cpu_online(td->cpu)); + kfree(td); + return 0; + } + + if (kthread_should_park()) { + __set_current_state(TASK_RUNNING); + preempt_enable(); + if (ht->park && td->status == HP_THREAD_ACTIVE) { + BUG_ON(td->cpu != smp_processor_id()); + ht->park(td->cpu); + td->status = HP_THREAD_PARKED; + } + kthread_parkme(); + /* We might have been woken for stop */ + continue; + } + + BUG_ON(td->cpu != smp_processor_id()); + + /* Check for state change setup */ + switch (td->status) { + case HP_THREAD_NONE: + preempt_enable(); + if (ht->setup) + ht->setup(td->cpu); + td->status = HP_THREAD_ACTIVE; + preempt_disable(); + break; + case HP_THREAD_PARKED: + preempt_enable(); + if (ht->unpark) + ht->unpark(td->cpu); + td->status = HP_THREAD_ACTIVE; + preempt_disable(); + break; + } + + if (!ht->thread_should_run(td->cpu)) { + preempt_enable(); + schedule(); + } else { + set_current_state(TASK_RUNNING); + preempt_enable(); + ht->thread_fn(td->cpu); + } + } +} + +static int +__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu) +{ + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); + struct smpboot_thread_data *td; + + if (tsk) + return 0; + + td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu)); + if (!td) + return -ENOMEM; + td->cpu = cpu; + td->ht = ht; + + tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu, + ht->thread_comm); + if (IS_ERR(tsk)) { + kfree(td); + return PTR_ERR(tsk); + } + + get_task_struct(tsk); + *per_cpu_ptr(ht->store, cpu) = tsk; + return 0; +} + +int smpboot_create_threads(unsigned int cpu) +{ + struct smp_hotplug_thread *cur; + int ret = 0; + + mutex_lock(&smpboot_threads_lock); + list_for_each_entry(cur, &hotplug_threads, list) { + ret = __smpboot_create_thread(cur, cpu); + if (ret) + break; + } + mutex_unlock(&smpboot_threads_lock); + return ret; +} + +static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cpu) +{ + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); + + kthread_unpark(tsk); +} + +void smpboot_unpark_threads(unsigned int cpu) +{ + struct smp_hotplug_thread *cur; + + mutex_lock(&smpboot_threads_lock); + list_for_each_entry(cur, &hotplug_threads, list) + smpboot_unpark_thread(cur, cpu); + mutex_unlock(&smpboot_threads_lock); +} + +static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu) +{ + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); + + if (tsk) + kthread_park(tsk); +} + +void smpboot_park_threads(unsigned int cpu) +{ + struct smp_hotplug_thread *cur; + + mutex_lock(&smpboot_threads_lock); + list_for_each_entry_reverse(cur, &hotplug_threads, list) + smpboot_park_thread(cur, cpu); + mutex_unlock(&smpboot_threads_lock); +} + +static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) +{ + unsigned int cpu; + + /* We need to destroy also the parked threads of offline cpus */ + for_each_possible_cpu(cpu) { + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); + + if (tsk) { + kthread_stop(tsk); + put_task_struct(tsk); + *per_cpu_ptr(ht->store, cpu) = NULL; + } + } +} + +/** + * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug + * @plug_thread: Hotplug thread descriptor + * + * Creates and starts the threads on all online cpus. + */ +int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) +{ + unsigned int cpu; + int ret = 0; + + mutex_lock(&smpboot_threads_lock); + for_each_online_cpu(cpu) { + ret = __smpboot_create_thread(plug_thread, cpu); + if (ret) { + smpboot_destroy_threads(plug_thread); + goto out; + } + smpboot_unpark_thread(plug_thread, cpu); + } + list_add(&plug_thread->list, &hotplug_threads); +out: + mutex_unlock(&smpboot_threads_lock); + return ret; +} +EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); + +/** + * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug + * @plug_thread: Hotplug thread descriptor + * + * Stops all threads on all possible cpus. + */ +void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread) +{ + get_online_cpus(); + mutex_lock(&smpboot_threads_lock); + list_del(&plug_thread->list); + smpboot_destroy_threads(plug_thread); + mutex_unlock(&smpboot_threads_lock); + put_online_cpus(); +} +EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); diff --git a/kernel/smpboot.h b/kernel/smpboot.h index 6ef9433e1c70..72415a0eb955 100644 --- a/kernel/smpboot.h +++ b/kernel/smpboot.h @@ -13,4 +13,8 @@ static inline void idle_thread_set_boot_cpu(void) { } static inline void idle_threads_init(void) { } #endif +int smpboot_create_threads(unsigned int cpu); +void smpboot_park_threads(unsigned int cpu); +void smpboot_unpark_threads(unsigned int cpu); + #endif diff --git a/kernel/softirq.c b/kernel/softirq.c index b73e681df09e..ed567babe789 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -23,6 +23,7 @@ #include <linux/rcupdate.h> #include <linux/ftrace.h> #include <linux/smp.h> +#include <linux/smpboot.h> #include <linux/tick.h> #define CREATE_TRACE_POINTS @@ -220,7 +221,7 @@ asmlinkage void __do_softirq(void) current->flags &= ~PF_MEMALLOC; pending = local_softirq_pending(); - account_system_vtime(current); + vtime_account_irq_enter(current); __local_bh_disable((unsigned long)__builtin_return_address(0), SOFTIRQ_OFFSET); @@ -271,7 +272,7 @@ restart: lockdep_softirq_exit(); - account_system_vtime(current); + vtime_account_irq_exit(current); __local_bh_enable(SOFTIRQ_OFFSET); tsk_restore_flags(current, old_flags, PF_MEMALLOC); } @@ -340,7 +341,7 @@ static inline void invoke_softirq(void) */ void irq_exit(void) { - account_system_vtime(current); + vtime_account_irq_exit(current); trace_hardirq_exit(); sub_preempt_count(IRQ_EXIT_OFFSET); if (!in_interrupt() && local_softirq_pending()) @@ -742,49 +743,22 @@ void __init softirq_init(void) open_softirq(HI_SOFTIRQ, tasklet_hi_action); } -static int run_ksoftirqd(void * __bind_cpu) +static int ksoftirqd_should_run(unsigned int cpu) { - set_current_state(TASK_INTERRUPTIBLE); - - while (!kthread_should_stop()) { - preempt_disable(); - if (!local_softirq_pending()) { - schedule_preempt_disabled(); - } - - __set_current_state(TASK_RUNNING); - - while (local_softirq_pending()) { - /* Preempt disable stops cpu going offline. - If already offline, we'll be on wrong CPU: - don't process */ - if (cpu_is_offline((long)__bind_cpu)) - goto wait_to_die; - local_irq_disable(); - if (local_softirq_pending()) - __do_softirq(); - local_irq_enable(); - sched_preempt_enable_no_resched(); - cond_resched(); - preempt_disable(); - rcu_note_context_switch((long)__bind_cpu); - } - preempt_enable(); - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); - return 0; + return local_softirq_pending(); +} -wait_to_die: - preempt_enable(); - /* Wait for kthread_stop */ - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - schedule(); - set_current_state(TASK_INTERRUPTIBLE); +static void run_ksoftirqd(unsigned int cpu) +{ + local_irq_disable(); + if (local_softirq_pending()) { + __do_softirq(); + rcu_note_context_switch(cpu); + local_irq_enable(); + cond_resched(); + return; } - __set_current_state(TASK_RUNNING); - return 0; + local_irq_enable(); } #ifdef CONFIG_HOTPLUG_CPU @@ -850,50 +824,14 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - int hotcpu = (unsigned long)hcpu; - struct task_struct *p; - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - p = kthread_create_on_node(run_ksoftirqd, - hcpu, - cpu_to_node(hotcpu), - "ksoftirqd/%d", hotcpu); - if (IS_ERR(p)) { - printk("ksoftirqd for %i failed\n", hotcpu); - return notifier_from_errno(PTR_ERR(p)); - } - kthread_bind(p, hotcpu); - per_cpu(ksoftirqd, hotcpu) = p; - break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - wake_up_process(per_cpu(ksoftirqd, hotcpu)); - break; #ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - if (!per_cpu(ksoftirqd, hotcpu)) - break; - /* Unbind so it can run. Fall thru. */ - kthread_bind(per_cpu(ksoftirqd, hotcpu), - cpumask_any(cpu_online_mask)); case CPU_DEAD: - case CPU_DEAD_FROZEN: { - static const struct sched_param param = { - .sched_priority = MAX_RT_PRIO-1 - }; - - p = per_cpu(ksoftirqd, hotcpu); - per_cpu(ksoftirqd, hotcpu) = NULL; - sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); - kthread_stop(p); - takeover_tasklets(hotcpu); + case CPU_DEAD_FROZEN: + takeover_tasklets((unsigned long)hcpu); break; - } #endif /* CONFIG_HOTPLUG_CPU */ - } + } return NOTIFY_OK; } @@ -901,14 +839,19 @@ static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; +static struct smp_hotplug_thread softirq_threads = { + .store = &ksoftirqd, + .thread_should_run = ksoftirqd_should_run, + .thread_fn = run_ksoftirqd, + .thread_comm = "ksoftirqd/%u", +}; + static __init int spawn_ksoftirqd(void) { - void *cpu = (void *)(long)smp_processor_id(); - int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - - BUG_ON(err != NOTIFY_OK); - cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); + + BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); + return 0; } early_initcall(spawn_ksoftirqd); diff --git a/kernel/srcu.c b/kernel/srcu.c index 2095be3318d5..2b859828cdc3 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -16,8 +16,10 @@ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * * Copyright (C) IBM Corporation, 2006 + * Copyright (C) Fujitsu, 2012 * * Author: Paul McKenney <paulmck@us.ibm.com> + * Lai Jiangshan <laijs@cn.fujitsu.com> * * For detailed explanation of Read-Copy Update mechanism see - * Documentation/RCU/ *.txt @@ -34,6 +36,10 @@ #include <linux/delay.h> #include <linux/srcu.h> +#include <trace/events/rcu.h> + +#include "rcu.h" + /* * Initialize an rcu_batch structure to empty. */ @@ -92,9 +98,6 @@ static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) } } -/* single-thread state-machine */ -static void process_srcu(struct work_struct *work); - static int init_srcu_struct_fields(struct srcu_struct *sp) { sp->completed = 0; @@ -379,7 +382,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, rcu_batch_queue(&sp->batch_queue, head); if (!sp->running) { sp->running = true; - queue_delayed_work(system_nrt_wq, &sp->work, 0); + schedule_delayed_work(&sp->work, 0); } spin_unlock_irqrestore(&sp->queue_lock, flags); } @@ -464,7 +467,9 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) */ void synchronize_srcu(struct srcu_struct *sp) { - __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT); + __synchronize_srcu(sp, rcu_expedited + ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT + : SYNCHRONIZE_SRCU_TRYCOUNT); } EXPORT_SYMBOL_GPL(synchronize_srcu); @@ -631,13 +636,13 @@ static void srcu_reschedule(struct srcu_struct *sp) } if (pending) - queue_delayed_work(system_nrt_wq, &sp->work, SRCU_INTERVAL); + schedule_delayed_work(&sp->work, SRCU_INTERVAL); } /* * This is the work-queue function that handles SRCU grace periods. */ -static void process_srcu(struct work_struct *work) +void process_srcu(struct work_struct *work) { struct srcu_struct *sp; @@ -648,3 +653,4 @@ static void process_srcu(struct work_struct *work) srcu_invoke_callbacks(sp); srcu_reschedule(sp); } +EXPORT_SYMBOL_GPL(process_srcu); diff --git a/kernel/sys.c b/kernel/sys.c index 241507f23eca..265b37690421 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -368,6 +368,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier); void kernel_restart(char *cmd) { kernel_restart_prepare(cmd); + disable_nonboot_cpus(); if (!cmd) printk(KERN_EMERG "Restarting system.\n"); else @@ -1045,7 +1046,7 @@ void do_sys_times(struct tms *tms) cputime_t tgutime, tgstime, cutime, cstime; spin_lock_irq(¤t->sighand->siglock); - thread_group_times(current, &tgutime, &tgstime); + thread_group_cputime_adjusted(current, &tgutime, &tgstime); cutime = current->signal->cutime; cstime = current->signal->cstime; spin_unlock_irq(¤t->sighand->siglock); @@ -1264,15 +1265,16 @@ DECLARE_RWSEM(uts_sem); * Work around broken programs that cannot handle "Linux 3.0". * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 */ -static int override_release(char __user *release, int len) +static int override_release(char __user *release, size_t len) { int ret = 0; - char buf[65]; if (current->personality & UNAME26) { - char *rest = UTS_RELEASE; + const char *rest = UTS_RELEASE; + char buf[65] = { 0 }; int ndots = 0; unsigned v; + size_t copy; while (*rest) { if (*rest == '.' && ++ndots >= 3) @@ -1282,8 +1284,9 @@ static int override_release(char __user *release, int len) rest++; } v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; - snprintf(buf, len, "2.6.%u%s", v, rest); - ret = copy_to_user(release, buf, len); + copy = clamp_t(size_t, len, 1, sizeof(buf)); + copy = scnprintf(buf, copy, "2.6.%u%s", v, rest); + ret = copy_to_user(release, buf, copy + 1); } return ret; } @@ -1701,7 +1704,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) utime = stime = 0; if (who == RUSAGE_THREAD) { - task_times(current, &utime, &stime); + task_cputime_adjusted(current, &utime, &stime); accumulate_thread_rusage(p, r); maxrss = p->signal->maxrss; goto out; @@ -1727,7 +1730,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) break; case RUSAGE_SELF: - thread_group_times(p, &tgutime, &tgstime); + thread_group_cputime_adjusted(p, &tgutime, &tgstime); utime += tgutime; stime += tgstime; r->ru_nvcsw += p->signal->nvcsw; @@ -1788,15 +1791,15 @@ SYSCALL_DEFINE1(umask, int, mask) #ifdef CONFIG_CHECKPOINT_RESTORE static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { - struct file *exe_file; + struct fd exe; struct dentry *dentry; int err; - exe_file = fget(fd); - if (!exe_file) + exe = fdget(fd); + if (!exe.file) return -EBADF; - dentry = exe_file->f_path.dentry; + dentry = exe.file->f_path.dentry; /* * Because the original mm->exe_file points to executable file, make @@ -1805,7 +1808,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) */ err = -EACCES; if (!S_ISREG(dentry->d_inode->i_mode) || - exe_file->f_path.mnt->mnt_flags & MNT_NOEXEC) + exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC) goto exit; err = inode_permission(dentry->d_inode, MAY_EXEC); @@ -1839,12 +1842,12 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) goto exit_unlock; err = 0; - set_mm_exe_file(mm, exe_file); + set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */ exit_unlock: up_write(&mm->mmap_sem); exit: - fput(exe_file); + fdput(exe); return err; } @@ -2204,7 +2207,7 @@ static int __orderly_poweroff(void) return -ENOMEM; } - ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, + ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, NULL, argv_cleanup, NULL); if (ret == -ENOMEM) argv_free(argv); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index dbff751e4086..395084d4ce16 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -25,6 +25,7 @@ cond_syscall(sys_swapoff); cond_syscall(sys_kexec_load); cond_syscall(compat_sys_kexec_load); cond_syscall(sys_init_module); +cond_syscall(sys_finit_module); cond_syscall(sys_delete_module); cond_syscall(sys_socketpair); cond_syscall(sys_bind); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 87174ef59161..c88878db491e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -97,10 +97,12 @@ extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern int max_threads; -extern int core_uses_pid; extern int suid_dumpable; +#ifdef CONFIG_COREDUMP +extern int core_uses_pid; extern char core_pattern[]; extern unsigned int core_pipe_limit; +#endif extern int pid_max; extern int min_free_kbytes; extern int pid_max_min, pid_max_max; @@ -177,8 +179,10 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#ifdef CONFIG_COREDUMP static int proc_dostring_coredump(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#endif #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses it's own private copy */ @@ -252,9 +256,11 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */ static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_wakeup_granularity_ns; /* 0 usecs */ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ +#ifdef CONFIG_SMP static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; -#endif +#endif /* CONFIG_SMP */ +#endif /* CONFIG_SCHED_DEBUG */ #ifdef CONFIG_COMPACTION static int min_extfrag_threshold; @@ -297,6 +303,7 @@ static struct ctl_table kern_table[] = { .extra1 = &min_wakeup_granularity_ns, .extra2 = &max_wakeup_granularity_ns, }, +#ifdef CONFIG_SMP { .procname = "sched_tunable_scaling", .data = &sysctl_sched_tunable_scaling, @@ -307,7 +314,7 @@ static struct ctl_table kern_table[] = { .extra2 = &max_sched_tunable_scaling, }, { - .procname = "sched_migration_cost", + .procname = "sched_migration_cost_ns", .data = &sysctl_sched_migration_cost, .maxlen = sizeof(unsigned int), .mode = 0644, @@ -321,14 +328,14 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, { - .procname = "sched_time_avg", + .procname = "sched_time_avg_ms", .data = &sysctl_sched_time_avg, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, }, { - .procname = "sched_shares_window", + .procname = "sched_shares_window_ns", .data = &sysctl_sched_shares_window, .maxlen = sizeof(unsigned int), .mode = 0644, @@ -343,7 +350,45 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, -#endif +#endif /* CONFIG_SMP */ +#ifdef CONFIG_NUMA_BALANCING + { + .procname = "numa_balancing_scan_delay_ms", + .data = &sysctl_numa_balancing_scan_delay, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "numa_balancing_scan_period_min_ms", + .data = &sysctl_numa_balancing_scan_period_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "numa_balancing_scan_period_reset", + .data = &sysctl_numa_balancing_scan_period_reset, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "numa_balancing_scan_period_max_ms", + .data = &sysctl_numa_balancing_scan_period_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "numa_balancing_scan_size_mb", + .data = &sysctl_numa_balancing_scan_size, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_NUMA_BALANCING */ +#endif /* CONFIG_SCHED_DEBUG */ { .procname = "sched_rt_period_us", .data = &sysctl_sched_rt_period, @@ -404,6 +449,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_COREDUMP { .procname = "core_uses_pid", .data = &core_uses_pid, @@ -425,6 +471,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#endif #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", @@ -559,7 +606,7 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif -#ifdef CONFIG_HOTPLUG + { .procname = "hotplug", .data = &uevent_helper, @@ -567,7 +614,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dostring, }, -#endif + #ifdef CONFIG_CHR_DEV_SG { .procname = "sg-big-buff", @@ -1543,8 +1590,7 @@ static struct ctl_table fs_table[] = { }; static struct ctl_table debug_table[] = { -#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ - defined(CONFIG_S390) || defined(CONFIG_TILE) +#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE { .procname = "exception-trace", .data = &show_unhandled_signals, @@ -2036,12 +2082,14 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, static void validate_coredump_safety(void) { +#ifdef CONFIG_COREDUMP if (suid_dumpable == SUID_DUMPABLE_SAFE && core_pattern[0] != '/' && core_pattern[0] != '|') { printk(KERN_WARNING "Unsafe core_pattern used with "\ "suid_dumpable=2. Pipe handler or fully qualified "\ "core dump path required.\n"); } +#endif } static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, @@ -2053,6 +2101,7 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, return error; } +#ifdef CONFIG_COREDUMP static int proc_dostring_coredump(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -2061,6 +2110,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, validate_coredump_safety(); return error; } +#endif static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, void __user *buffer, diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 65bdcf198d4e..5a6384450501 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1344,7 +1344,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, goto out_putname; } - mnt = current->nsproxy->pid_ns->proc_mnt; + mnt = task_active_pid_ns(current)->proc_mnt; file = file_open_root(mnt->mnt_root, mnt, pathname, flags); result = PTR_ERR(file); if (IS_ERR(file)) diff --git a/kernel/task_work.c b/kernel/task_work.c index d320d44903bd..65bd3c92d6f3 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -2,26 +2,20 @@ #include <linux/task_work.h> #include <linux/tracehook.h> +static struct callback_head work_exited; /* all we need is ->next == NULL */ + int -task_work_add(struct task_struct *task, struct callback_head *twork, bool notify) +task_work_add(struct task_struct *task, struct callback_head *work, bool notify) { - struct callback_head *last, *first; - unsigned long flags; + struct callback_head *head; - /* - * Not inserting the new work if the task has already passed - * exit_task_work() is the responisbility of callers. - */ - raw_spin_lock_irqsave(&task->pi_lock, flags); - last = task->task_works; - first = last ? last->next : twork; - twork->next = first; - if (last) - last->next = twork; - task->task_works = twork; - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + do { + head = ACCESS_ONCE(task->task_works); + if (unlikely(head == &work_exited)) + return -ESRCH; + work->next = head; + } while (cmpxchg(&task->task_works, head, work) != head); - /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */ if (notify) set_notify_resume(task); return 0; @@ -30,52 +24,69 @@ task_work_add(struct task_struct *task, struct callback_head *twork, bool notify struct callback_head * task_work_cancel(struct task_struct *task, task_work_func_t func) { + struct callback_head **pprev = &task->task_works; + struct callback_head *work = NULL; unsigned long flags; - struct callback_head *last, *res = NULL; - + /* + * If cmpxchg() fails we continue without updating pprev. + * Either we raced with task_work_add() which added the + * new entry before this work, we will find it again. Or + * we raced with task_work_run(), *pprev == NULL/exited. + */ raw_spin_lock_irqsave(&task->pi_lock, flags); - last = task->task_works; - if (last) { - struct callback_head *q = last, *p = q->next; - while (1) { - if (p->func == func) { - q->next = p->next; - if (p == last) - task->task_works = q == p ? NULL : q; - res = p; - break; - } - if (p == last) - break; - q = p; - p = q->next; - } + while ((work = ACCESS_ONCE(*pprev))) { + read_barrier_depends(); + if (work->func != func) + pprev = &work->next; + else if (cmpxchg(pprev, work, work->next) == work) + break; } raw_spin_unlock_irqrestore(&task->pi_lock, flags); - return res; + + return work; } void task_work_run(void) { struct task_struct *task = current; - struct callback_head *p, *q; + struct callback_head *work, *head, *next; + + for (;;) { + /* + * work->func() can do task_work_add(), do not set + * work_exited unless the list is empty. + */ + do { + work = ACCESS_ONCE(task->task_works); + head = !work && (task->flags & PF_EXITING) ? + &work_exited : NULL; + } while (cmpxchg(&task->task_works, work, head) != work); - while (1) { - raw_spin_lock_irq(&task->pi_lock); - p = task->task_works; - task->task_works = NULL; - raw_spin_unlock_irq(&task->pi_lock); + if (!work) + break; + /* + * Synchronize with task_work_cancel(). It can't remove + * the first entry == work, cmpxchg(task_works) should + * fail, but it can play with *work and other entries. + */ + raw_spin_unlock_wait(&task->pi_lock); + smp_mb(); - if (unlikely(!p)) - return; + /* Reverse the list to run the works in fifo order */ + head = NULL; + do { + next = work->next; + work->next = head; + head = work; + work = next; + } while (work); - q = p->next; /* head */ - p->next = NULL; /* cut it */ - while (q) { - p = q->next; - q->func(q); - q = p; + work = head; + do { + next = work->next; + work->func(work); + work = next; cond_resched(); - } + } while (work); } } diff --git a/kernel/taskstats.c b/kernel/taskstats.c index d0a32796550f..145bb4d3bd4d 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -27,6 +27,7 @@ #include <linux/cgroup.h> #include <linux/fs.h> #include <linux/file.h> +#include <linux/pid_namespace.h> #include <net/genetlink.h> #include <linux/atomic.h> @@ -174,7 +175,9 @@ static void send_cpu_listeners(struct sk_buff *skb, up_write(&listeners->sem); } -static void fill_stats(struct task_struct *tsk, struct taskstats *stats) +static void fill_stats(struct user_namespace *user_ns, + struct pid_namespace *pid_ns, + struct task_struct *tsk, struct taskstats *stats) { memset(stats, 0, sizeof(*stats)); /* @@ -190,7 +193,7 @@ static void fill_stats(struct task_struct *tsk, struct taskstats *stats) stats->version = TASKSTATS_VERSION; stats->nvcsw = tsk->nvcsw; stats->nivcsw = tsk->nivcsw; - bacct_add_tsk(stats, tsk); + bacct_add_tsk(user_ns, pid_ns, stats, tsk); /* fill in extended acct fields */ xacct_add_tsk(stats, tsk); @@ -207,7 +210,7 @@ static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) rcu_read_unlock(); if (!tsk) return -ESRCH; - fill_stats(tsk, stats); + fill_stats(current_user_ns(), task_active_pid_ns(current), tsk, stats); put_task_struct(tsk); return 0; } @@ -291,6 +294,12 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) if (!cpumask_subset(mask, cpu_possible_mask)) return -EINVAL; + if (current_user_ns() != &init_user_ns) + return -EINVAL; + + if (task_active_pid_ns(current) != &init_pid_ns) + return -EINVAL; + if (isadd == REGISTER) { for_each_cpu(cpu, mask) { s = kmalloc_node(sizeof(struct listener), @@ -415,16 +424,15 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) struct nlattr *na; size_t size; u32 fd; - struct file *file; - int fput_needed; + struct fd f; na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; if (!na) return -EINVAL; fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); - file = fget_light(fd, &fput_needed); - if (!file) + f = fdget(fd); + if (!f.file) return 0; size = nla_total_size(sizeof(struct cgroupstats)); @@ -437,6 +445,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, sizeof(struct cgroupstats)); if (na == NULL) { + nlmsg_free(rep_skb); rc = -EMSGSIZE; goto err; } @@ -444,7 +453,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) stats = nla_data(na); memset(stats, 0, sizeof(*stats)); - rc = cgroupstats_build(stats, file->f_dentry); + rc = cgroupstats_build(stats, f.file->f_dentry); if (rc < 0) { nlmsg_free(rep_skb); goto err; @@ -453,7 +462,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) rc = send_reply(rep_skb, info); err: - fput_light(file, fput_needed); + fdput(f); return rc; } @@ -467,7 +476,7 @@ static int cmd_attr_register_cpumask(struct genl_info *info) rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); if (rc < 0) goto out; - rc = add_del_listener(info->snd_pid, mask, REGISTER); + rc = add_del_listener(info->snd_portid, mask, REGISTER); out: free_cpumask_var(mask); return rc; @@ -483,7 +492,7 @@ static int cmd_attr_deregister_cpumask(struct genl_info *info) rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); if (rc < 0) goto out; - rc = add_del_listener(info->snd_pid, mask, DEREGISTER); + rc = add_del_listener(info->snd_portid, mask, DEREGISTER); out: free_cpumask_var(mask); return rc; @@ -631,11 +640,12 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) if (rc < 0) return; - stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid); + stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, + task_pid_nr_ns(tsk, &init_pid_ns)); if (!stats) goto err; - fill_stats(tsk, stats); + fill_stats(&init_user_ns, &init_pid_ns, tsk, stats); /* * Doesn't matter if tsk is the leader or the last group member leaving @@ -643,7 +653,8 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) if (!is_thread_group || !group_dead) goto send; - stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid); + stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, + task_tgid_nr_ns(tsk, &init_pid_ns)); if (!stats) goto err; diff --git a/kernel/time.c b/kernel/time.c index ba744cf80696..d226c6a3fd28 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -30,7 +30,7 @@ #include <linux/export.h> #include <linux/timex.h> #include <linux/capability.h> -#include <linux/clocksource.h> +#include <linux/timekeeper_internal.h> #include <linux/errno.h> #include <linux/syscalls.h> #include <linux/security.h> diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index fd42bd452b75..8601f0db1261 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -16,6 +16,10 @@ config ARCH_CLOCKSOURCE_DATA config GENERIC_TIME_VSYSCALL bool +# Timekeeping vsyscall support +config GENERIC_TIME_VSYSCALL_OLD + bool + # ktime_t scalar 64bit nsec representation config KTIME_SCALAR bool diff --git a/kernel/time/Makefile b/kernel/time/Makefile index e2fd74b8e8c2..ff7d9d2ab504 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,4 +1,4 @@ -obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o obj-y += timeconv.o posix-clock.o alarmtimer.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index aa27d391bfc8..f11d83b12949 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -37,7 +37,6 @@ static struct alarm_base { spinlock_t lock; struct timerqueue_head timerqueue; - struct hrtimer timer; ktime_t (*gettime)(void); clockid_t base_clockid; } alarm_bases[ALARM_NUMTYPE]; @@ -46,6 +45,8 @@ static struct alarm_base { static ktime_t freezer_delta; static DEFINE_SPINLOCK(freezer_delta_lock); +static struct wakeup_source *ws; + #ifdef CONFIG_RTC_CLASS /* rtc timer and device for setting alarm wakeups at suspend */ static struct rtc_timer rtctimer; @@ -130,50 +131,35 @@ static inline void alarmtimer_rtc_timer_init(void) { } * @base: pointer to the base where the timer is being run * @alarm: pointer to alarm being enqueued. * - * Adds alarm to a alarm_base timerqueue and if necessary sets - * an hrtimer to run. + * Adds alarm to a alarm_base timerqueue * * Must hold base->lock when calling. */ static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) { + if (alarm->state & ALARMTIMER_STATE_ENQUEUED) + timerqueue_del(&base->timerqueue, &alarm->node); + timerqueue_add(&base->timerqueue, &alarm->node); alarm->state |= ALARMTIMER_STATE_ENQUEUED; - - if (&alarm->node == timerqueue_getnext(&base->timerqueue)) { - hrtimer_try_to_cancel(&base->timer); - hrtimer_start(&base->timer, alarm->node.expires, - HRTIMER_MODE_ABS); - } } /** - * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue + * alarmtimer_dequeue - Removes an alarm timer from an alarm_base timerqueue * @base: pointer to the base where the timer is running * @alarm: pointer to alarm being removed * - * Removes alarm to a alarm_base timerqueue and if necessary sets - * a new timer to run. + * Removes alarm to a alarm_base timerqueue * * Must hold base->lock when calling. */ -static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) +static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm) { - struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue); - if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED)) return; timerqueue_del(&base->timerqueue, &alarm->node); alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; - - if (next == &alarm->node) { - hrtimer_try_to_cancel(&base->timer); - next = timerqueue_getnext(&base->timerqueue); - if (!next) - return; - hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS); - } } @@ -188,42 +174,23 @@ static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) */ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) { - struct alarm_base *base = container_of(timer, struct alarm_base, timer); - struct timerqueue_node *next; + struct alarm *alarm = container_of(timer, struct alarm, timer); + struct alarm_base *base = &alarm_bases[alarm->type]; unsigned long flags; - ktime_t now; int ret = HRTIMER_NORESTART; int restart = ALARMTIMER_NORESTART; spin_lock_irqsave(&base->lock, flags); - now = base->gettime(); - while ((next = timerqueue_getnext(&base->timerqueue))) { - struct alarm *alarm; - ktime_t expired = next->expires; - - if (expired.tv64 > now.tv64) - break; - - alarm = container_of(next, struct alarm, node); - - timerqueue_del(&base->timerqueue, &alarm->node); - alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; - - alarm->state |= ALARMTIMER_STATE_CALLBACK; - spin_unlock_irqrestore(&base->lock, flags); - if (alarm->function) - restart = alarm->function(alarm, now); - spin_lock_irqsave(&base->lock, flags); - alarm->state &= ~ALARMTIMER_STATE_CALLBACK; + alarmtimer_dequeue(base, alarm); + spin_unlock_irqrestore(&base->lock, flags); - if (restart != ALARMTIMER_NORESTART) { - timerqueue_add(&base->timerqueue, &alarm->node); - alarm->state |= ALARMTIMER_STATE_ENQUEUED; - } - } + if (alarm->function) + restart = alarm->function(alarm, base->gettime()); - if (next) { - hrtimer_set_expires(&base->timer, next->expires); + spin_lock_irqsave(&base->lock, flags); + if (restart != ALARMTIMER_NORESTART) { + hrtimer_set_expires(&alarm->timer, alarm->node.expires); + alarmtimer_enqueue(base, alarm); ret = HRTIMER_RESTART; } spin_unlock_irqrestore(&base->lock, flags); @@ -250,6 +217,7 @@ static int alarmtimer_suspend(struct device *dev) unsigned long flags; struct rtc_device *rtc; int i; + int ret; spin_lock_irqsave(&freezer_delta_lock, flags); min = freezer_delta; @@ -279,8 +247,10 @@ static int alarmtimer_suspend(struct device *dev) if (min.tv64 == 0) return 0; - /* XXX - Should we enforce a minimum sleep time? */ - WARN_ON(min.tv64 < NSEC_PER_SEC); + if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) { + __pm_wakeup_event(ws, 2 * MSEC_PER_SEC); + return -EBUSY; + } /* Setup an rtc timer to fire that far in the future */ rtc_timer_cancel(rtc, &rtctimer); @@ -288,9 +258,11 @@ static int alarmtimer_suspend(struct device *dev) now = rtc_tm_to_ktime(tm); now = ktime_add(now, min); - rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); - - return 0; + /* Set alarm, if in the past reject suspend briefly to handle */ + ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); + if (ret < 0) + __pm_wakeup_event(ws, MSEC_PER_SEC); + return ret; } #else static int alarmtimer_suspend(struct device *dev) @@ -324,6 +296,9 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) { timerqueue_init(&alarm->node); + hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid, + HRTIMER_MODE_ABS); + alarm->timer.function = alarmtimer_fired; alarm->function = function; alarm->type = type; alarm->state = ALARMTIMER_STATE_INACTIVE; @@ -334,17 +309,19 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, * @alarm: ptr to alarm to set * @start: time to run the alarm */ -void alarm_start(struct alarm *alarm, ktime_t start) +int alarm_start(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; unsigned long flags; + int ret; spin_lock_irqsave(&base->lock, flags); - if (alarmtimer_active(alarm)) - alarmtimer_remove(base, alarm); alarm->node.expires = start; alarmtimer_enqueue(base, alarm); + ret = hrtimer_start(&alarm->timer, alarm->node.expires, + HRTIMER_MODE_ABS); spin_unlock_irqrestore(&base->lock, flags); + return ret; } /** @@ -358,18 +335,12 @@ int alarm_try_to_cancel(struct alarm *alarm) { struct alarm_base *base = &alarm_bases[alarm->type]; unsigned long flags; - int ret = -1; - spin_lock_irqsave(&base->lock, flags); - - if (alarmtimer_callback_running(alarm)) - goto out; + int ret; - if (alarmtimer_is_queued(alarm)) { - alarmtimer_remove(base, alarm); - ret = 1; - } else - ret = 0; -out: + spin_lock_irqsave(&base->lock, flags); + ret = hrtimer_try_to_cancel(&alarm->timer); + if (ret >= 0) + alarmtimer_dequeue(base, alarm); spin_unlock_irqrestore(&base->lock, flags); return ret; } @@ -802,10 +773,6 @@ static int __init alarmtimer_init(void) for (i = 0; i < ALARM_NUMTYPE; i++) { timerqueue_init_head(&alarm_bases[i].timerqueue); spin_lock_init(&alarm_bases[i].lock); - hrtimer_init(&alarm_bases[i].timer, - alarm_bases[i].base_clockid, - HRTIMER_MODE_ABS); - alarm_bases[i].timer.function = alarmtimer_fired; } error = alarmtimer_rtc_interface_setup(); @@ -821,6 +788,7 @@ static int __init alarmtimer_init(void) error = PTR_ERR(pdev); goto out_drv; } + ws = wakeup_source_register("alarmtimer"); return 0; out_drv: diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 7e1ce012a851..30b6de0d977c 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -397,6 +397,30 @@ void clockevents_exchange_device(struct clock_event_device *old, local_irq_restore(flags); } +/** + * clockevents_suspend - suspend clock devices + */ +void clockevents_suspend(void) +{ + struct clock_event_device *dev; + + list_for_each_entry_reverse(dev, &clockevent_devices, list) + if (dev->suspend) + dev->suspend(dev); +} + +/** + * clockevents_resume - resume clock devices + */ +void clockevents_resume(void) +{ + struct clock_event_device *dev; + + list_for_each_entry(dev, &clockevent_devices, list) + if (dev->resume) + dev->resume(dev); +} + #ifdef CONFIG_GENERIC_CLOCKEVENTS /** * clockevents_notify - notification about relevant events diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 46da0537c10b..7a925ba456fb 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -37,7 +37,7 @@ * requested HZ value. It is also not recommended * for "tick-less" systems. */ -#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/SHIFTED_HZ)) +#define NSEC_PER_JIFFY ((NSEC_PER_SEC+HZ/2)/HZ) /* Since jiffies uses a simple NSEC_PER_JIFFY multiplier * conversion, the .shift value could be zero. However @@ -58,7 +58,7 @@ static cycle_t jiffies_read(struct clocksource *cs) return (cycle_t) jiffies; } -struct clocksource clocksource_jiffies = { +static struct clocksource clocksource_jiffies = { .name = "jiffies", .rating = 1, /* lowest valid rating*/ .read = jiffies_read, @@ -67,6 +67,8 @@ struct clocksource clocksource_jiffies = { .shift = JIFFIES_SHIFT, }; +__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); + #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) { @@ -74,9 +76,9 @@ u64 get_jiffies_64(void) u64 ret; do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqbegin(&jiffies_lock); ret = jiffies_64; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&jiffies_lock, seq)); return ret; } EXPORT_SYMBOL(get_jiffies_64); @@ -95,3 +97,33 @@ struct clocksource * __init __weak clocksource_default_clock(void) { return &clocksource_jiffies; } + +struct clocksource refined_jiffies; + +int register_refined_jiffies(long cycles_per_second) +{ + u64 nsec_per_tick, shift_hz; + long cycles_per_tick; + + + + refined_jiffies = clocksource_jiffies; + refined_jiffies.name = "refined-jiffies"; + refined_jiffies.rating++; + + /* Calc cycles per tick */ + cycles_per_tick = (cycles_per_second + HZ/2)/HZ; + /* shift_hz stores hz<<8 for extra accuracy */ + shift_hz = (u64)cycles_per_second << 8; + shift_hz += cycles_per_tick/2; + do_div(shift_hz, cycles_per_tick); + /* Calculate nsec_per_tick using shift_hz */ + nsec_per_tick = (u64)NSEC_PER_SEC << 8; + nsec_per_tick += (u32)shift_hz/2; + do_div(nsec_per_tick, (u32)shift_hz); + + refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; + + clocksource_register(&refined_jiffies); + return 0; +} diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index da6c9ecad4e4..b1600a6973f4 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -63,13 +63,13 @@ int tick_is_oneshot_available(void) static void tick_periodic(int cpu) { if (tick_do_timer_cpu == cpu) { - write_seqlock(&xtime_lock); + write_seqlock(&jiffies_lock); /* Keep track of the next tick event */ tick_next_period = ktime_add(tick_next_period, tick_period); do_timer(1); - write_sequnlock(&xtime_lock); + write_sequnlock(&jiffies_lock); } update_process_times(user_mode(get_irq_regs())); @@ -130,9 +130,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) ktime_t next; do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqbegin(&jiffies_lock); next = tick_next_period; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&jiffies_lock, seq)); clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 4e265b901fed..cf3e59ed6dc0 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -141,4 +141,3 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) #endif extern void do_timer(unsigned long ticks); -extern seqlock_t xtime_lock; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3a9e5d5c1091..d58e552d9fd1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -31,7 +31,7 @@ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); /* - * The time, when the last jiffy update happened. Protected by xtime_lock. + * The time, when the last jiffy update happened. Protected by jiffies_lock. */ static ktime_t last_jiffies_update; @@ -49,14 +49,14 @@ static void tick_do_update_jiffies64(ktime_t now) ktime_t delta; /* - * Do a quick check without holding xtime_lock: + * Do a quick check without holding jiffies_lock: */ delta = ktime_sub(now, last_jiffies_update); if (delta.tv64 < tick_period.tv64) return; - /* Reevalute with xtime_lock held */ - write_seqlock(&xtime_lock); + /* Reevalute with jiffies_lock held */ + write_seqlock(&jiffies_lock); delta = ktime_sub(now, last_jiffies_update); if (delta.tv64 >= tick_period.tv64) { @@ -79,7 +79,7 @@ static void tick_do_update_jiffies64(ktime_t now) /* Keep the tick_next_period variable up to date */ tick_next_period = ktime_add(last_jiffies_update, tick_period); } - write_sequnlock(&xtime_lock); + write_sequnlock(&jiffies_lock); } /* @@ -89,15 +89,58 @@ static ktime_t tick_init_jiffy_update(void) { ktime_t period; - write_seqlock(&xtime_lock); + write_seqlock(&jiffies_lock); /* Did we start the jiffies update yet ? */ if (last_jiffies_update.tv64 == 0) last_jiffies_update = tick_next_period; period = last_jiffies_update; - write_sequnlock(&xtime_lock); + write_sequnlock(&jiffies_lock); return period; } + +static void tick_sched_do_timer(ktime_t now) +{ + int cpu = smp_processor_id(); + +#ifdef CONFIG_NO_HZ + /* + * Check if the do_timer duty was dropped. We don't care about + * concurrency: This happens only when the cpu in charge went + * into a long sleep. If two cpus happen to assign themself to + * this duty, then the jiffies update is still serialized by + * jiffies_lock. + */ + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) + tick_do_timer_cpu = cpu; +#endif + + /* Check, if the jiffies need an update */ + if (tick_do_timer_cpu == cpu) + tick_do_update_jiffies64(now); +} + +static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) +{ +#ifdef CONFIG_NO_HZ + /* + * When we are idle and the tick is stopped, we have to touch + * the watchdog as we might not schedule for a really long + * time. This happens on complete idle SMP systems while + * waiting on the login prompt. We also increment the "start of + * idle" jiffy stamp so the idle accounting adjustment we do + * when we go busy again does not account too much ticks. + */ + if (ts->tick_stopped) { + touch_softlockup_watchdog(); + if (is_idle_task(current)) + ts->idle_jiffies++; + } +#endif + update_process_times(user_mode(regs)); + profile_tick(CPU_PROFILING); +} + /* * NOHZ - aka dynamic tick functionality */ @@ -282,11 +325,11 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, /* Read jiffies and the time when jiffies were updated last */ do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqbegin(&jiffies_lock); last_update = last_jiffies_update; last_jiffies = jiffies; time_delta = timekeeping_max_deferment(); - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&jiffies_lock, seq)); if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || arch_needs_cpu(cpu)) { @@ -372,7 +415,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, * the scheduler tick in nohz_restart_sched_tick. */ if (!ts->tick_stopped) { - select_nohz_load_balancer(1); + nohz_balance_enter_idle(cpu); calc_load_enter_idle(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); @@ -436,7 +479,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (unlikely(local_softirq_pending() && cpu_online(cpu))) { static int ratelimit; - if (ratelimit < 10) { + if (ratelimit < 10 && + (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", (unsigned int) local_softirq_pending()); ratelimit++; @@ -525,6 +569,8 @@ void tick_nohz_irq_exit(void) if (!ts->inidle) return; + /* Cancel the timer because CPU already waken up from the C-states*/ + menu_hrtimer_cancel(); __tick_nohz_idle_enter(ts); } @@ -569,7 +615,6 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ - select_nohz_load_balancer(0); tick_do_update_jiffies64(now); update_cpu_load_nohz(); @@ -621,6 +666,8 @@ void tick_nohz_idle_exit(void) ts->inidle = 0; + /* Cancel the timer because CPU already waken up from the C-states*/ + menu_hrtimer_cancel(); if (ts->idle_active || ts->tick_stopped) now = ktime_get(); @@ -648,40 +695,12 @@ static void tick_nohz_handler(struct clock_event_device *dev) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); struct pt_regs *regs = get_irq_regs(); - int cpu = smp_processor_id(); ktime_t now = ktime_get(); dev->next_event.tv64 = KTIME_MAX; - /* - * Check if the do_timer duty was dropped. We don't care about - * concurrency: This happens only when the cpu in charge went - * into a long sleep. If two cpus happen to assign themself to - * this duty, then the jiffies update is still serialized by - * xtime_lock. - */ - if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) - tick_do_timer_cpu = cpu; - - /* Check, if the jiffies need an update */ - if (tick_do_timer_cpu == cpu) - tick_do_update_jiffies64(now); - - /* - * When we are idle and the tick is stopped, we have to touch - * the watchdog as we might not schedule for a really long - * time. This happens on complete idle SMP systems while - * waiting on the login prompt. We also increment the "start - * of idle" jiffy stamp so the idle accounting adjustment we - * do when we go busy again does not account too much ticks. - */ - if (ts->tick_stopped) { - touch_softlockup_watchdog(); - ts->idle_jiffies++; - } - - update_process_times(user_mode(regs)); - profile_tick(CPU_PROFILING); + tick_sched_do_timer(now); + tick_sched_handle(ts, regs); while (tick_nohz_reprogram(ts, now)) { now = ktime_get(); @@ -794,7 +813,7 @@ void tick_check_idle(int cpu) #ifdef CONFIG_HIGH_RES_TIMERS /* * We rearm the timer until we get disabled by the idle code. - * Called with interrupts disabled and timer->base->cpu_base->lock held. + * Called with interrupts disabled. */ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) { @@ -802,45 +821,15 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) container_of(timer, struct tick_sched, sched_timer); struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); - int cpu = smp_processor_id(); -#ifdef CONFIG_NO_HZ - /* - * Check if the do_timer duty was dropped. We don't care about - * concurrency: This happens only when the cpu in charge went - * into a long sleep. If two cpus happen to assign themself to - * this duty, then the jiffies update is still serialized by - * xtime_lock. - */ - if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) - tick_do_timer_cpu = cpu; -#endif - - /* Check, if the jiffies need an update */ - if (tick_do_timer_cpu == cpu) - tick_do_update_jiffies64(now); + tick_sched_do_timer(now); /* * Do not call, when we are not in irq context and have * no valid regs pointer */ - if (regs) { - /* - * When we are idle and the tick is stopped, we have to touch - * the watchdog as we might not schedule for a really long - * time. This happens on complete idle SMP systems while - * waiting on the login prompt. We also increment the "start of - * idle" jiffy stamp so the idle accounting adjustment we do - * when we go busy again does not account too much ticks. - */ - if (ts->tick_stopped) { - touch_softlockup_watchdog(); - if (idle_cpu(cpu)) - ts->idle_jiffies++; - } - update_process_times(user_mode(regs)); - profile_tick(CPU_PROFILING); - } + if (regs) + tick_sched_handle(ts, regs); hrtimer_forward(timer, now, tick_period); @@ -874,7 +863,7 @@ void tick_setup_sched_timer(void) /* Get the next period (per cpu) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); - /* Offset the tick to avert xtime_lock contention. */ + /* Offset the tick to avert jiffies_lock contention. */ if (sched_skew_tick) { u64 offset = ktime_to_ns(tick_period) >> 1; do_div(offset, num_possible_cpus()); diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c deleted file mode 100644 index a9ae369925ce..000000000000 --- a/kernel/time/timecompare.c +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright (C) 2009 Intel Corporation. - * Author: Patrick Ohly <patrick.ohly@intel.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include <linux/timecompare.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/math64.h> -#include <linux/kernel.h> - -/* - * fixed point arithmetic scale factor for skew - * - * Usually one would measure skew in ppb (parts per billion, 1e9), but - * using a factor of 2 simplifies the math. - */ -#define TIMECOMPARE_SKEW_RESOLUTION (((s64)1)<<30) - -ktime_t timecompare_transform(struct timecompare *sync, - u64 source_tstamp) -{ - u64 nsec; - - nsec = source_tstamp + sync->offset; - nsec += (s64)(source_tstamp - sync->last_update) * sync->skew / - TIMECOMPARE_SKEW_RESOLUTION; - - return ns_to_ktime(nsec); -} -EXPORT_SYMBOL_GPL(timecompare_transform); - -int timecompare_offset(struct timecompare *sync, - s64 *offset, - u64 *source_tstamp) -{ - u64 start_source = 0, end_source = 0; - struct { - s64 offset; - s64 duration_target; - } buffer[10], sample, *samples; - int counter = 0, i; - int used; - int index; - int num_samples = sync->num_samples; - - if (num_samples > ARRAY_SIZE(buffer)) { - samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); - if (!samples) { - samples = buffer; - num_samples = ARRAY_SIZE(buffer); - } - } else { - samples = buffer; - } - - /* run until we have enough valid samples, but do not try forever */ - i = 0; - counter = 0; - while (1) { - u64 ts; - ktime_t start, end; - - start = sync->target(); - ts = timecounter_read(sync->source); - end = sync->target(); - - if (!i) - start_source = ts; - - /* ignore negative durations */ - sample.duration_target = ktime_to_ns(ktime_sub(end, start)); - if (sample.duration_target >= 0) { - /* - * assume symetric delay to and from source: - * average target time corresponds to measured - * source time - */ - sample.offset = - (ktime_to_ns(end) + ktime_to_ns(start)) / 2 - - ts; - - /* simple insertion sort based on duration */ - index = counter - 1; - while (index >= 0) { - if (samples[index].duration_target < - sample.duration_target) - break; - samples[index + 1] = samples[index]; - index--; - } - samples[index + 1] = sample; - counter++; - } - - i++; - if (counter >= num_samples || i >= 100000) { - end_source = ts; - break; - } - } - - *source_tstamp = (end_source + start_source) / 2; - - /* remove outliers by only using 75% of the samples */ - used = counter * 3 / 4; - if (!used) - used = counter; - if (used) { - /* calculate average */ - s64 off = 0; - for (index = 0; index < used; index++) - off += samples[index].offset; - *offset = div_s64(off, used); - } - - if (samples && samples != buffer) - kfree(samples); - - return used; -} -EXPORT_SYMBOL_GPL(timecompare_offset); - -void __timecompare_update(struct timecompare *sync, - u64 source_tstamp) -{ - s64 offset; - u64 average_time; - - if (!timecompare_offset(sync, &offset, &average_time)) - return; - - if (!sync->last_update) { - sync->last_update = average_time; - sync->offset = offset; - sync->skew = 0; - } else { - s64 delta_nsec = average_time - sync->last_update; - - /* avoid division by negative or small deltas */ - if (delta_nsec >= 10000) { - s64 delta_offset_nsec = offset - sync->offset; - s64 skew; /* delta_offset_nsec * - TIMECOMPARE_SKEW_RESOLUTION / - delta_nsec */ - u64 divisor; - - /* div_s64() is limited to 32 bit divisor */ - skew = delta_offset_nsec * TIMECOMPARE_SKEW_RESOLUTION; - divisor = delta_nsec; - while (unlikely(divisor >= ((s64)1) << 32)) { - /* divide both by 2; beware, right shift - of negative value has undefined - behavior and can only be used for - the positive divisor */ - skew = div_s64(skew, 2); - divisor >>= 1; - } - skew = div_s64(skew, divisor); - - /* - * Calculate new overall skew as 4/16 the - * old value and 12/16 the new one. This is - * a rather arbitrary tradeoff between - * only using the latest measurement (0/16 and - * 16/16) and even more weight on past measurements. - */ -#define TIMECOMPARE_NEW_SKEW_PER_16 12 - sync->skew = - div_s64((16 - TIMECOMPARE_NEW_SKEW_PER_16) * - sync->skew + - TIMECOMPARE_NEW_SKEW_PER_16 * skew, - 16); - sync->last_update = average_time; - sync->offset = offset; - } - } -} -EXPORT_SYMBOL_GPL(__timecompare_update); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d3b91e75cecd..cbc6acb0db3f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -8,6 +8,7 @@ * */ +#include <linux/timekeeper_internal.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/percpu.h> @@ -20,71 +21,11 @@ #include <linux/time.h> #include <linux/tick.h> #include <linux/stop_machine.h> +#include <linux/pvclock_gtod.h> -/* Structure holding internal timekeeping values. */ -struct timekeeper { - /* Current clocksource used for timekeeping. */ - struct clocksource *clock; - /* NTP adjusted clock multiplier */ - u32 mult; - /* The shift value of the current clocksource. */ - u32 shift; - /* Number of clock cycles in one NTP interval. */ - cycle_t cycle_interval; - /* Number of clock shifted nano seconds in one NTP interval. */ - u64 xtime_interval; - /* shifted nano seconds left over when rounding cycle_interval */ - s64 xtime_remainder; - /* Raw nano seconds accumulated per NTP interval. */ - u32 raw_interval; - - /* Current CLOCK_REALTIME time in seconds */ - u64 xtime_sec; - /* Clock shifted nano seconds */ - u64 xtime_nsec; - - /* Difference between accumulated time and NTP time in ntp - * shifted nano seconds. */ - s64 ntp_error; - /* Shift conversion between clock shifted nano seconds and - * ntp shifted nano seconds. */ - u32 ntp_error_shift; - - /* - * wall_to_monotonic is what we need to add to xtime (or xtime corrected - * for sub jiffie times) to get to monotonic time. Monotonic is pegged - * at zero at system boot time, so wall_to_monotonic will be negative, - * however, we will ALWAYS keep the tv_nsec part positive so we can use - * the usual normalization. - * - * wall_to_monotonic is moved after resume from suspend for the - * monotonic time not to jump. We need to add total_sleep_time to - * wall_to_monotonic to get the real boot based time offset. - * - * - wall_to_monotonic is no longer the boot time, getboottime must be - * used instead. - */ - struct timespec wall_to_monotonic; - /* Offset clock monotonic -> clock realtime */ - ktime_t offs_real; - /* time spent in suspend */ - struct timespec total_sleep_time; - /* Offset clock monotonic -> clock boottime */ - ktime_t offs_boot; - /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ - struct timespec raw_time; - /* Seqlock for all timekeeper values */ - seqlock_t lock; -}; static struct timekeeper timekeeper; -/* - * This read-write spinlock protects us from races in SMP while - * playing with xtime. - */ -__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); - /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -96,15 +37,6 @@ static inline void tk_normalize_xtime(struct timekeeper *tk) } } -static struct timespec tk_xtime(struct timekeeper *tk) -{ - struct timespec ts; - - ts.tv_sec = tk->xtime_sec; - ts.tv_nsec = (long)(tk->xtime_nsec >> tk->shift); - return ts; -} - static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) { tk->xtime_sec = ts->tv_sec; @@ -243,17 +175,63 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) return nsec + arch_gettimeoffset(); } +static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); + +static void update_pvclock_gtod(struct timekeeper *tk) +{ + raw_notifier_call_chain(&pvclock_gtod_chain, 0, tk); +} + +/** + * pvclock_gtod_register_notifier - register a pvclock timedata update listener + * + * Must hold write on timekeeper.lock + */ +int pvclock_gtod_register_notifier(struct notifier_block *nb) +{ + struct timekeeper *tk = &timekeeper; + unsigned long flags; + int ret; + + write_seqlock_irqsave(&tk->lock, flags); + ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); + /* update timekeeping data */ + update_pvclock_gtod(tk); + write_sequnlock_irqrestore(&tk->lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); + +/** + * pvclock_gtod_unregister_notifier - unregister a pvclock + * timedata update listener + * + * Must hold write on timekeeper.lock + */ +int pvclock_gtod_unregister_notifier(struct notifier_block *nb) +{ + struct timekeeper *tk = &timekeeper; + unsigned long flags; + int ret; + + write_seqlock_irqsave(&tk->lock, flags); + ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); + write_sequnlock_irqrestore(&tk->lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); + /* must hold write on timekeeper.lock */ static void timekeeping_update(struct timekeeper *tk, bool clearntp) { - struct timespec xt; - if (clearntp) { tk->ntp_error = 0; ntp_clear(); } - xt = tk_xtime(tk); - update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult); + update_vsyscall(tk); + update_pvclock_gtod(tk); } /** @@ -776,6 +754,7 @@ static void timekeeping_resume(void) read_persistent_clock(&ts); + clockevents_resume(); clocksource_resume(); write_seqlock_irqsave(&tk->lock, flags); @@ -835,6 +814,7 @@ static int timekeeping_suspend(void) clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); clocksource_suspend(); + clockevents_suspend(); return 0; } @@ -1111,7 +1091,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ - raw_nsecs = tk->raw_interval << shift; + raw_nsecs = (u64)tk->raw_interval << shift; raw_nsecs += tk->raw_time.tv_nsec; if (raw_nsecs >= NSEC_PER_SEC) { u64 raw_secs = raw_nsecs; @@ -1128,6 +1108,33 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, return offset; } +#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD +static inline void old_vsyscall_fixup(struct timekeeper *tk) +{ + s64 remainder; + + /* + * Store only full nanoseconds into xtime_nsec after rounding + * it up and add the remainder to the error difference. + * XXX - This is necessary to avoid small 1ns inconsistnecies caused + * by truncating the remainder in vsyscalls. However, it causes + * additional work to be done in timekeeping_adjust(). Once + * the vsyscall implementations are converted to use xtime_nsec + * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD + * users are removed, this can be killed. + */ + remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); + tk->xtime_nsec -= remainder; + tk->xtime_nsec += 1ULL << tk->shift; + tk->ntp_error += remainder << tk->ntp_error_shift; + +} +#else +#define old_vsyscall_fixup(tk) +#endif + + + /** * update_wall_time - Uses the current clocksource to increment the wall time * @@ -1139,7 +1146,6 @@ static void update_wall_time(void) cycle_t offset; int shift = 0, maxshift; unsigned long flags; - s64 remainder; write_seqlock_irqsave(&tk->lock, flags); @@ -1181,20 +1187,11 @@ static void update_wall_time(void) /* correct the clock when NTP error is too big */ timekeeping_adjust(tk, offset); - /* - * Store only full nanoseconds into xtime_nsec after rounding - * it up and add the remainder to the error difference. - * XXX - This is necessary to avoid small 1ns inconsistnecies caused - * by truncating the remainder in vsyscalls. However, it causes - * additional work to be done in timekeeping_adjust(). Once - * the vsyscall implementations are converted to use xtime_nsec - * (shifted nanoseconds), this can be killed. - */ - remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); - tk->xtime_nsec -= remainder; - tk->xtime_nsec += 1ULL << tk->shift; - tk->ntp_error += remainder << tk->ntp_error_shift; + * XXX This can be killed once everyone converts + * to the new update_vsyscall. + */ + old_vsyscall_fixup(tk); /* * Finally, make sure that after the rounding @@ -1346,9 +1343,7 @@ struct timespec get_monotonic_coarse(void) } /* - * The 64-bit jiffies value is not atomic - you MUST NOT read it - * without sampling the sequence number in xtime_lock. - * jiffies is defined in the linker script... + * Must hold jiffies_lock */ void do_timer(unsigned long ticks) { @@ -1436,7 +1431,7 @@ EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); */ void xtime_update(unsigned long ticks) { - write_seqlock(&xtime_lock); + write_seqlock(&jiffies_lock); do_timer(ticks); - write_sequnlock(&xtime_lock); + write_sequnlock(&jiffies_lock); } diff --git a/kernel/timer.c b/kernel/timer.c index 8c5e7b908c68..367d00858482 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -63,6 +63,7 @@ EXPORT_SYMBOL(jiffies_64); #define TVR_SIZE (1 << TVR_BITS) #define TVN_MASK (TVN_SIZE - 1) #define TVR_MASK (TVR_SIZE - 1) +#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1)) struct tvec { struct list_head vec[TVN_SIZE]; @@ -92,24 +93,25 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; /* Functions below help us manage 'deferrable' flag */ static inline unsigned int tbase_get_deferrable(struct tvec_base *base) { - return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); + return ((unsigned int)(unsigned long)base & TIMER_DEFERRABLE); } -static inline struct tvec_base *tbase_get_base(struct tvec_base *base) +static inline unsigned int tbase_get_irqsafe(struct tvec_base *base) { - return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); + return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE); } -static inline void timer_set_deferrable(struct timer_list *timer) +static inline struct tvec_base *tbase_get_base(struct tvec_base *base) { - timer->base = TBASE_MAKE_DEFERRED(timer->base); + return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK)); } static inline void timer_set_base(struct timer_list *timer, struct tvec_base *new_base) { - timer->base = (struct tvec_base *)((unsigned long)(new_base) | - tbase_get_deferrable(timer->base)); + unsigned long flags = (unsigned long)timer->base & TIMER_FLAG_MASK; + + timer->base = (struct tvec_base *)((unsigned long)(new_base) | flags); } static unsigned long round_jiffies_common(unsigned long j, int cpu, @@ -358,11 +360,12 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); } else { int i; - /* If the timeout is larger than 0xffffffff on 64-bit - * architectures then we use the maximum timeout: + /* If the timeout is larger than MAX_TVAL (on 64-bit + * architectures or with CONFIG_BASE_SMALL=1) then we + * use the maximum timeout. */ - if (idx > 0xffffffffUL) { - idx = 0xffffffffUL; + if (idx > MAX_TVAL) { + idx = MAX_TVAL; expires = idx + base->timer_jiffies; } i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; @@ -563,16 +566,14 @@ static inline void debug_timer_assert_init(struct timer_list *timer) debug_object_assert_init(timer, &timer_debug_descr); } -static void __init_timer(struct timer_list *timer, - const char *name, - struct lock_class_key *key); +static void do_init_timer(struct timer_list *timer, unsigned int flags, + const char *name, struct lock_class_key *key); -void init_timer_on_stack_key(struct timer_list *timer, - const char *name, - struct lock_class_key *key) +void init_timer_on_stack_key(struct timer_list *timer, unsigned int flags, + const char *name, struct lock_class_key *key) { debug_object_init_on_stack(timer, &timer_debug_descr); - __init_timer(timer, name, key); + do_init_timer(timer, flags, name, key); } EXPORT_SYMBOL_GPL(init_timer_on_stack_key); @@ -613,12 +614,13 @@ static inline void debug_assert_init(struct timer_list *timer) debug_timer_assert_init(timer); } -static void __init_timer(struct timer_list *timer, - const char *name, - struct lock_class_key *key) +static void do_init_timer(struct timer_list *timer, unsigned int flags, + const char *name, struct lock_class_key *key) { + struct tvec_base *base = __raw_get_cpu_var(tvec_bases); + timer->entry.next = NULL; - timer->base = __raw_get_cpu_var(tvec_bases); + timer->base = (void *)((unsigned long)base | flags); timer->slack = -1; #ifdef CONFIG_TIMER_STATS timer->start_site = NULL; @@ -628,22 +630,10 @@ static void __init_timer(struct timer_list *timer, lockdep_init_map(&timer->lockdep_map, name, key, 0); } -void setup_deferrable_timer_on_stack_key(struct timer_list *timer, - const char *name, - struct lock_class_key *key, - void (*function)(unsigned long), - unsigned long data) -{ - timer->function = function; - timer->data = data; - init_timer_on_stack_key(timer, name, key); - timer_set_deferrable(timer); -} -EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key); - /** * init_timer_key - initialize a timer * @timer: the timer to be initialized + * @flags: timer flags * @name: name of the timer * @key: lockdep class key of the fake lock used for tracking timer * sync lock dependencies @@ -651,24 +641,14 @@ EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key); * init_timer_key() must be done to a timer prior calling *any* of the * other timer functions. */ -void init_timer_key(struct timer_list *timer, - const char *name, - struct lock_class_key *key) +void init_timer_key(struct timer_list *timer, unsigned int flags, + const char *name, struct lock_class_key *key) { debug_init(timer); - __init_timer(timer, name, key); + do_init_timer(timer, flags, name, key); } EXPORT_SYMBOL(init_timer_key); -void init_timer_deferrable_key(struct timer_list *timer, - const char *name, - struct lock_class_key *key) -{ - init_timer_key(timer, name, key); - timer_set_deferrable(timer); -} -EXPORT_SYMBOL(init_timer_deferrable_key); - static inline void detach_timer(struct timer_list *timer, bool clear_pending) { struct list_head *entry = &timer->entry; @@ -686,7 +666,7 @@ detach_expired_timer(struct timer_list *timer, struct tvec_base *base) { detach_timer(timer, true); if (!tbase_get_deferrable(timer->base)) - timer->base->active_timers--; + base->active_timers--; } static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, @@ -697,7 +677,7 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, detach_timer(timer, clear_pending); if (!tbase_get_deferrable(timer->base)) { - timer->base->active_timers--; + base->active_timers--; if (timer->expires == base->next_timer) base->next_timer = base->timer_jiffies; } @@ -1029,14 +1009,14 @@ EXPORT_SYMBOL(try_to_del_timer_sync); * * Synchronization rules: Callers must prevent restarting of the timer, * otherwise this function is meaningless. It must not be called from - * interrupt contexts. The caller must not hold locks which would prevent - * completion of the timer's handler. The timer's handler must not call - * add_timer_on(). Upon exit the timer is not queued and the handler is - * not running on any CPU. + * interrupt contexts unless the timer is an irqsafe one. The caller must + * not hold locks which would prevent completion of the timer's + * handler. The timer's handler must not call add_timer_on(). Upon exit the + * timer is not queued and the handler is not running on any CPU. * - * Note: You must not hold locks that are held in interrupt context - * while calling this function. Even if the lock has nothing to do - * with the timer in question. Here's why: + * Note: For !irqsafe timers, you must not hold locks that are held in + * interrupt context while calling this function. Even if the lock has + * nothing to do with the timer in question. Here's why: * * CPU0 CPU1 * ---- ---- @@ -1073,7 +1053,7 @@ int del_timer_sync(struct timer_list *timer) * don't use it in hardirq context, because it * could lead to deadlock. */ - WARN_ON(in_irq()); + WARN_ON(in_irq() && !tbase_get_irqsafe(timer->base)); for (;;) { int ret = try_to_del_timer_sync(timer); if (ret >= 0) @@ -1180,19 +1160,27 @@ static inline void __run_timers(struct tvec_base *base) while (!list_empty(head)) { void (*fn)(unsigned long); unsigned long data; + bool irqsafe; timer = list_first_entry(head, struct timer_list,entry); fn = timer->function; data = timer->data; + irqsafe = tbase_get_irqsafe(timer->base); timer_stats_account_timer(timer); base->running_timer = timer; detach_expired_timer(timer, base); - spin_unlock_irq(&base->lock); - call_timer_fn(timer, fn, data); - spin_lock_irq(&base->lock); + if (irqsafe) { + spin_unlock(&base->lock); + call_timer_fn(timer, fn, data); + spin_lock(&base->lock); + } else { + spin_unlock_irq(&base->lock); + call_timer_fn(timer, fn, data); + spin_lock_irq(&base->lock); + } } } base->running_timer = NULL; @@ -1791,9 +1779,13 @@ static struct notifier_block __cpuinitdata timers_nb = { void __init init_timers(void) { - int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); + int err; + + /* ensure there are enough low bits for flags in timer->base pointer */ + BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK); + err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); init_timer_stats(); BUG_ON(err != NOTIFY_OK); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8c4c07071cc5..5d89335a485f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS help See Documentation/trace/ftrace-design.txt +config HAVE_FENTRY + bool + help + Arch supports the gcc options -pg with -mfentry + config HAVE_C_RECORDMCOUNT bool help @@ -57,8 +62,12 @@ config HAVE_C_RECORDMCOUNT config TRACER_MAX_TRACE bool +config TRACE_CLOCK + bool + config RING_BUFFER bool + select TRACE_CLOCK config FTRACE_NMI_ENTER bool @@ -109,6 +118,8 @@ config TRACING select NOP_TRACER select BINARY_PRINTF select EVENT_TRACING + select TRACE_CLOCK + select IRQ_WORK config GENERIC_TRACER bool diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index b831087c8200..d7e2068e4b71 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -5,10 +5,12 @@ ifdef CONFIG_FUNCTION_TRACER ORIG_CFLAGS := $(KBUILD_CFLAGS) KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) +ifdef CONFIG_FTRACE_SELFTEST # selftest needs instrumentation CFLAGS_trace_selftest_dynamic.o = -pg obj-y += trace_selftest_dynamic.o endif +endif # If unlikely tracing is enabled, do not trace these files ifdef CONFIG_TRACING_BRANCHES @@ -17,11 +19,7 @@ endif CFLAGS_trace_events_filter.o := -I$(src) -# -# Make the trace clocks available generally: it's infrastructure -# relied on by ptrace for example: -# -obj-y += trace_clock.o +obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b4f20fba09fc..3ffe4c5ad3f3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -10,7 +10,7 @@ * Based on code in the latency_tracer, that is: * * Copyright (C) 2004-2006 Ingo Molnar - * Copyright (C) 2004 William Lee Irwin III + * Copyright (C) 2004 Nadia Yvette Chambers */ #include <linux/stop_machine.h> @@ -64,12 +64,20 @@ #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) +static struct ftrace_ops ftrace_list_end __read_mostly = { + .func = ftrace_stub, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, +}; + /* ftrace_enabled is a method to turn ftrace on or off */ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; /* Quick disabling of function tracer. */ -int function_trace_stop; +int function_trace_stop __read_mostly; + +/* Current function tracing op */ +struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; /* List for set_ftrace_pid's pids. */ LIST_HEAD(ftrace_pids); @@ -86,22 +94,43 @@ static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops ftrace_list_end __read_mostly = { - .func = ftrace_stub, -}; - static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; -static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub; -ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; static struct ftrace_ops control_ops; -static void -ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); +#if ARCH_SUPPORTS_FTRACE_OPS +static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs); +#else +/* See comment below, where ftrace_ops_list_func is defined */ +static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); +#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) +#endif + +/** + * ftrace_nr_registered_ops - return number of ops registered + * + * Returns the number of ftrace_ops registered and tracing functions + */ +int ftrace_nr_registered_ops(void) +{ + struct ftrace_ops *ops; + int cnt = 0; + + mutex_lock(&ftrace_lock); + + for (ops = ftrace_ops_list; + ops != &ftrace_list_end; ops = ops->next) + cnt++; + + mutex_unlock(&ftrace_lock); + + return cnt; +} /* * Traverse the ftrace_global_list, invoking all entries. The reason that we @@ -112,29 +141,29 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); * * Silly Alpha and silly pointer-speculation compiler optimizations! */ -static void ftrace_global_list_func(unsigned long ip, - unsigned long parent_ip) +static void +ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs) { - struct ftrace_ops *op; - if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) return; trace_recursion_set(TRACE_GLOBAL_BIT); op = rcu_dereference_raw(ftrace_global_list); /*see above*/ while (op != &ftrace_list_end) { - op->func(ip, parent_ip); + op->func(ip, parent_ip, op, regs); op = rcu_dereference_raw(op->next); /*see above*/ }; trace_recursion_clear(TRACE_GLOBAL_BIT); } -static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) +static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs) { if (!test_tsk_trace_trace(current)) return; - ftrace_pid_function(ip, parent_ip); + ftrace_pid_function(ip, parent_ip, op, regs); } static void set_ftrace_pid_function(ftrace_func_t func) @@ -153,25 +182,9 @@ static void set_ftrace_pid_function(ftrace_func_t func) void clear_ftrace_function(void) { ftrace_trace_function = ftrace_stub; - __ftrace_trace_function = ftrace_stub; - __ftrace_trace_function_delay = ftrace_stub; ftrace_pid_function = ftrace_stub; } -#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST -/* - * For those archs that do not test ftrace_trace_stop in their - * mcount call site, we need to do it from C. - */ -static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) -{ - if (function_trace_stop) - return; - - __ftrace_trace_function(ip, parent_ip); -} -#endif - static void control_ops_disable_all(struct ftrace_ops *ops) { int cpu; @@ -230,28 +243,27 @@ static void update_ftrace_function(void) /* * If we are at the end of the list and this ops is - * not dynamic, then have the mcount trampoline call - * the function directly + * recursion safe and not dynamic and the arch supports passing ops, + * then have the mcount trampoline call the function directly. */ if (ftrace_ops_list == &ftrace_list_end || (ftrace_ops_list->next == &ftrace_list_end && - !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC))) + !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) && + (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) && + !FTRACE_FORCE_LIST_FUNC)) { + /* Set the ftrace_ops that the arch callback uses */ + if (ftrace_ops_list == &global_ops) + function_trace_op = ftrace_global_list; + else + function_trace_op = ftrace_ops_list; func = ftrace_ops_list->func; - else + } else { + /* Just use the default ftrace_ops */ + function_trace_op = &ftrace_list_end; func = ftrace_ops_list_func; + } -#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST ftrace_trace_function = func; -#else -#ifdef CONFIG_DYNAMIC_FTRACE - /* do not update till all functions have been modified */ - __ftrace_trace_function_delay = func; -#else - __ftrace_trace_function = func; -#endif - ftrace_trace_function = - (func == ftrace_stub) ? func : ftrace_test_stop_func; -#endif } static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) @@ -325,6 +337,20 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) return -EINVAL; +#ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS + /* + * If the ftrace_ops specifies SAVE_REGS, then it only can be used + * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set. + * Setting SAVE_REGS_IF_SUPPORTED makes SAVE_REGS irrelevant. + */ + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS && + !(ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED)) + return -EINVAL; + + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED) + ops->flags |= FTRACE_OPS_FL_SAVE_REGS; +#endif + if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; @@ -773,7 +799,8 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip) } static void -function_profile_call(unsigned long ip, unsigned long parent_ip) +function_profile_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct pt_regs *regs) { struct ftrace_profile_stat *stat; struct ftrace_profile *rec; @@ -803,7 +830,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip) #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int profile_graph_entry(struct ftrace_graph_ent *trace) { - function_profile_call(trace->func, 0); + function_profile_call(trace->func, 0, NULL, NULL); return 1; } @@ -863,6 +890,7 @@ static void unregister_ftrace_profiler(void) #else static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static int register_ftrace_profiler(void) @@ -1045,6 +1073,7 @@ static struct ftrace_ops global_ops = { .func = ftrace_stub, .notrace_hash = EMPTY_HASH, .filter_hash = EMPTY_HASH, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static DEFINE_MUTEX(ftrace_regex_lock); @@ -1525,6 +1554,12 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, rec->flags++; if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) return; + /* + * If any ops wants regs saved for this function + * then all ops will get saved regs. + */ + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) + rec->flags |= FTRACE_FL_REGS; } else { if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) return; @@ -1616,18 +1651,59 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) if (enable && (rec->flags & ~FTRACE_FL_MASK)) flag = FTRACE_FL_ENABLED; + /* + * If enabling and the REGS flag does not match the REGS_EN, then + * do not ignore this record. Set flags to fail the compare against + * ENABLED. + */ + if (flag && + (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN))) + flag |= FTRACE_FL_REGS; + /* If the state of this record hasn't changed, then do nothing */ if ((rec->flags & FTRACE_FL_ENABLED) == flag) return FTRACE_UPDATE_IGNORE; if (flag) { - if (update) + /* Save off if rec is being enabled (for return value) */ + flag ^= rec->flags & FTRACE_FL_ENABLED; + + if (update) { rec->flags |= FTRACE_FL_ENABLED; - return FTRACE_UPDATE_MAKE_CALL; + if (flag & FTRACE_FL_REGS) { + if (rec->flags & FTRACE_FL_REGS) + rec->flags |= FTRACE_FL_REGS_EN; + else + rec->flags &= ~FTRACE_FL_REGS_EN; + } + } + + /* + * If this record is being updated from a nop, then + * return UPDATE_MAKE_CALL. + * Otherwise, if the EN flag is set, then return + * UPDATE_MODIFY_CALL_REGS to tell the caller to convert + * from the non-save regs, to a save regs function. + * Otherwise, + * return UPDATE_MODIFY_CALL to tell the caller to convert + * from the save regs, to a non-save regs function. + */ + if (flag & FTRACE_FL_ENABLED) + return FTRACE_UPDATE_MAKE_CALL; + else if (rec->flags & FTRACE_FL_REGS_EN) + return FTRACE_UPDATE_MODIFY_CALL_REGS; + else + return FTRACE_UPDATE_MODIFY_CALL; } - if (update) - rec->flags &= ~FTRACE_FL_ENABLED; + if (update) { + /* If there's no more users, clear all flags */ + if (!(rec->flags & ~FTRACE_FL_MASK)) + rec->flags = 0; + else + /* Just disable the record (keep REGS state) */ + rec->flags &= ~FTRACE_FL_ENABLED; + } return FTRACE_UPDATE_MAKE_NOP; } @@ -1662,13 +1738,17 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable) static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable) { + unsigned long ftrace_old_addr; unsigned long ftrace_addr; int ret; - ftrace_addr = (unsigned long)FTRACE_ADDR; - ret = ftrace_update_record(rec, enable); + if (rec->flags & FTRACE_FL_REGS) + ftrace_addr = (unsigned long)FTRACE_REGS_ADDR; + else + ftrace_addr = (unsigned long)FTRACE_ADDR; + switch (ret) { case FTRACE_UPDATE_IGNORE: return 0; @@ -1678,6 +1758,15 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) case FTRACE_UPDATE_MAKE_NOP: return ftrace_make_nop(NULL, rec, ftrace_addr); + + case FTRACE_UPDATE_MODIFY_CALL_REGS: + case FTRACE_UPDATE_MODIFY_CALL: + if (rec->flags & FTRACE_FL_REGS) + ftrace_old_addr = (unsigned long)FTRACE_ADDR; + else + ftrace_old_addr = (unsigned long)FTRACE_REGS_ADDR; + + return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); } return -1; /* unknow ftrace bug */ @@ -1882,16 +1971,6 @@ static void ftrace_run_update_code(int command) */ arch_ftrace_update_code(command); -#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST - /* - * For archs that call ftrace_test_stop_func(), we must - * wait till after we update all the function callers - * before we update the callback. This keeps different - * ops that record different functions from corrupting - * each other. - */ - __ftrace_trace_function = __ftrace_trace_function_delay; -#endif function_trace_stop--; ret = ftrace_arch_code_modify_post_process(); @@ -2358,7 +2437,7 @@ static void reset_iter_read(struct ftrace_iterator *iter) { iter->pos = 0; iter->func_pos = 0; - iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH); + iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH); } static void *t_start(struct seq_file *m, loff_t *pos) @@ -2441,8 +2520,9 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, "%ps", (void *)rec->ip); if (iter->flags & FTRACE_ITER_ENABLED) - seq_printf(m, " (%ld)", - rec->flags & ~FTRACE_FL_MASK); + seq_printf(m, " (%ld)%s", + rec->flags & ~FTRACE_FL_MASK, + rec->flags & FTRACE_FL_REGS ? " R" : ""); seq_printf(m, "\n"); return 0; @@ -2595,12 +2675,12 @@ ftrace_notrace_open(struct inode *inode, struct file *file) } loff_t -ftrace_regex_lseek(struct file *file, loff_t offset, int origin) +ftrace_regex_lseek(struct file *file, loff_t offset, int whence) { loff_t ret; if (file->f_mode & FMODE_READ) - ret = seq_lseek(file, offset, origin); + ret = seq_lseek(file, offset, whence); else file->f_pos = ret = 1; @@ -2788,10 +2868,10 @@ static int __init ftrace_mod_cmd_init(void) { return register_ftrace_command(&ftrace_mod_cmd); } -device_initcall(ftrace_mod_cmd_init); +core_initcall(ftrace_mod_cmd_init); -static void -function_trace_probe_call(unsigned long ip, unsigned long parent_ip) +static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) { struct ftrace_func_probe *entry; struct hlist_head *hhd; @@ -3162,8 +3242,27 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf, } static int -ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, - int reset, int enable) +ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) +{ + struct ftrace_func_entry *entry; + + if (!ftrace_location(ip)) + return -EINVAL; + + if (remove) { + entry = ftrace_lookup_ip(hash, ip); + if (!entry) + return -ENOENT; + free_hash_entry(hash, entry); + return 0; + } + + return add_hash_entry(hash, ip); +} + +static int +ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, + unsigned long ip, int remove, int reset, int enable) { struct ftrace_hash **orig_hash; struct ftrace_hash *hash; @@ -3192,6 +3291,11 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, ret = -EINVAL; goto out_regex_unlock; } + if (ip) { + ret = ftrace_match_addr(hash, ip, remove); + if (ret < 0) + goto out_regex_unlock; + } mutex_lock(&ftrace_lock); ret = ftrace_hash_move(ops, enable, orig_hash, hash); @@ -3208,6 +3312,37 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, return ret; } +static int +ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, + int reset, int enable) +{ + return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable); +} + +/** + * ftrace_set_filter_ip - set a function to filter on in ftrace by address + * @ops - the ops to set the filter with + * @ip - the address to add to or remove from the filter. + * @remove - non zero to remove the ip from the filter + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled + * If @ip is NULL, it failes to update filter. + */ +int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, + int remove, int reset) +{ + return ftrace_set_addr(ops, ip, remove, reset, 1); +} +EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); + +static int +ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, + int reset, int enable) +{ + return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable); +} + /** * ftrace_set_filter - set a function to filter on in ftrace * @ops - the ops to set the filter with @@ -3912,6 +4047,7 @@ void __init ftrace_init(void) static struct ftrace_ops global_ops = { .func = ftrace_stub, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static int __init ftrace_nodyn_init(void) @@ -3919,7 +4055,7 @@ static int __init ftrace_nodyn_init(void) ftrace_enabled = 1; return 0; } -device_initcall(ftrace_nodyn_init); +core_initcall(ftrace_nodyn_init); static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } static inline void ftrace_startup_enable(int command) { } @@ -3942,10 +4078,9 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) #endif /* CONFIG_DYNAMIC_FTRACE */ static void -ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip) +ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs) { - struct ftrace_ops *op; - if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT))) return; @@ -3959,7 +4094,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip) while (op != &ftrace_list_end) { if (!ftrace_function_local_disabled(op) && ftrace_ops_test(op, ip)) - op->func(ip, parent_ip); + op->func(ip, parent_ip, op, regs); op = rcu_dereference_raw(op->next); }; @@ -3969,13 +4104,18 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops control_ops = { .func = ftrace_ops_control_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; -static void -ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) +static inline void +__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ignored, struct pt_regs *regs) { struct ftrace_ops *op; + if (function_trace_stop) + return; + if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) return; @@ -3988,13 +4128,39 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) op = rcu_dereference_raw(ftrace_ops_list); while (op != &ftrace_list_end) { if (ftrace_ops_test(op, ip)) - op->func(ip, parent_ip); + op->func(ip, parent_ip, op, regs); op = rcu_dereference_raw(op->next); }; preempt_enable_notrace(); trace_recursion_clear(TRACE_INTERNAL_BIT); } +/* + * Some archs only support passing ip and parent_ip. Even though + * the list function ignores the op parameter, we do not want any + * C side effects, where a function is called without the caller + * sending a third parameter. + * Archs are to support both the regs and ftrace_ops at the same time. + * If they support ftrace_ops, it is assumed they support regs. + * If call backs want to use regs, they must either check for regs + * being NULL, or ARCH_SUPPORTS_FTRACE_SAVE_REGS. + * Note, ARCH_SUPPORT_SAVE_REGS expects a full regs to be saved. + * An architecture can pass partial regs with ftrace_ops and still + * set the ARCH_SUPPORT_FTARCE_OPS. + */ +#if ARCH_SUPPORTS_FTRACE_OPS +static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs) +{ + __ftrace_ops_list_func(ip, parent_ip, NULL, regs); +} +#else +static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) +{ + __ftrace_ops_list_func(ip, parent_ip, NULL, NULL); +} +#endif + static void clear_ftrace_swapper(void) { struct task_struct *p; @@ -4215,7 +4381,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, if (strlen(tmp) == 0) return 1; - ret = strict_strtol(tmp, 10, &val); + ret = kstrtol(tmp, 10, &val); if (ret < 0) return ret; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 49491fa7daa2..ce8514feedcd 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -460,9 +460,10 @@ struct ring_buffer_per_cpu { unsigned long lost_events; unsigned long last_overrun; local_t entries_bytes; - local_t commit_overrun; - local_t overrun; local_t entries; + local_t overrun; + local_t commit_overrun; + local_t dropped_events; local_t committing; local_t commits; unsigned long read; @@ -1396,6 +1397,8 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) struct list_head *head_page_with_bit; head_page = &rb_set_head_page(cpu_buffer)->list; + if (!head_page) + break; prev_page = head_page->prev; first_page = pages->next; @@ -1567,6 +1570,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, put_online_cpus(); } else { + /* Make sure this CPU has been intitialized */ + if (!cpumask_test_cpu(cpu_id, buffer->cpumask)) + goto out; + cpu_buffer = buffer->buffers[cpu_id]; if (nr_pages == cpu_buffer->nr_pages) @@ -1816,7 +1823,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) } /** - * ring_buffer_update_event - update event type and data + * rb_update_event - update event type and data * @event: the even to update * @type: the type of event * @length: the size of the event field in the ring buffer @@ -2151,8 +2158,10 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, * If we are not in overwrite mode, * this is easy, just stop here. */ - if (!(buffer->flags & RB_FL_OVERWRITE)) + if (!(buffer->flags & RB_FL_OVERWRITE)) { + local_inc(&cpu_buffer->dropped_events); goto out_reset; + } ret = rb_handle_head_page(cpu_buffer, tail_page, @@ -2716,8 +2725,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); * and not the length of the event which would hold the header. */ int ring_buffer_write(struct ring_buffer *buffer, - unsigned long length, - void *data) + unsigned long length, + void *data) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; @@ -2816,7 +2825,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable); * to the buffer after this will fail and return NULL. * * This is different than ring_buffer_record_disable() as - * it works like an on/off switch, where as the disable() verison + * it works like an on/off switch, where as the disable() version * must be paired with a enable(). */ void ring_buffer_record_off(struct ring_buffer *buffer) @@ -2839,7 +2848,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_off); * ring_buffer_record_off(). * * This is different than ring_buffer_record_enable() as - * it works like an on/off switch, where as the enable() verison + * it works like an on/off switch, where as the enable() version * must be paired with a disable(). */ void ring_buffer_record_on(struct ring_buffer *buffer) @@ -2925,12 +2934,12 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) * @buffer: The ring buffer * @cpu: The per CPU buffer to read from. */ -unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) +u64 ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) { unsigned long flags; struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *bpage; - unsigned long ret; + u64 ret = 0; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; @@ -2945,7 +2954,8 @@ unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) bpage = cpu_buffer->reader_page; else bpage = rb_set_head_page(cpu_buffer); - ret = bpage->page->time_stamp; + if (bpage) + ret = bpage->page->time_stamp; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return ret; @@ -2991,7 +3001,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); /** - * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer + * ring_buffer_overrun_cpu - get the number of overruns caused by the ring + * buffer wrapping around (only if RB_FL_OVERWRITE is on). * @buffer: The ring buffer * @cpu: The per CPU buffer to get the number of overruns from */ @@ -3011,7 +3022,9 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); /** - * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits + * ring_buffer_commit_overrun_cpu - get the number of overruns caused by + * commits failing due to the buffer wrapping around while there are uncommitted + * events, such as during an interrupt storm. * @buffer: The ring buffer * @cpu: The per CPU buffer to get the number of overruns from */ @@ -3032,6 +3045,28 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu) EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); /** + * ring_buffer_dropped_events_cpu - get the number of dropped events caused by + * the ring buffer filling up (only if RB_FL_OVERWRITE is off). + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long +ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = local_read(&cpu_buffer->dropped_events); + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu); + +/** * ring_buffer_entries - get the number of entries in a buffer * @buffer: The ring buffer * @@ -3256,6 +3291,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) * Splice the empty reader page into the list around the head. */ reader = rb_set_head_page(cpu_buffer); + if (!reader) + goto out; cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); cpu_buffer->reader_page->list.prev = reader->list.prev; @@ -3774,12 +3811,17 @@ void ring_buffer_read_finish(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + unsigned long flags; /* * Ring buffer is disabled from recording, here's a good place - * to check the integrity of the ring buffer. + * to check the integrity of the ring buffer. + * Must prevent readers from trying to read, as the check + * clears the HEAD page and readers require it. */ + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_check_pages(cpu_buffer); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); atomic_dec(&cpu_buffer->record_disabled); atomic_dec(&cpu_buffer->buffer->resize_disabled); @@ -3860,9 +3902,10 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->read = 0; - local_set(&cpu_buffer->commit_overrun, 0); local_set(&cpu_buffer->entries_bytes, 0); local_set(&cpu_buffer->overrun, 0); + local_set(&cpu_buffer->commit_overrun, 0); + local_set(&cpu_buffer->dropped_events, 0); local_set(&cpu_buffer->entries, 0); local_set(&cpu_buffer->committing, 0); local_set(&cpu_buffer->commits, 0); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5c38c81496ce..e5125677efa0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9,7 +9,7 @@ * * Based on code from the latency_tracer, that is: * Copyright (C) 2004-2006 Ingo Molnar - * Copyright (C) 2004 William Lee Irwin III + * Copyright (C) 2004 Nadia Yvette Chambers */ #include <linux/ring_buffer.h> #include <generated/utsrelease.h> @@ -19,6 +19,7 @@ #include <linux/seq_file.h> #include <linux/notifier.h> #include <linux/irqflags.h> +#include <linux/irq_work.h> #include <linux/debugfs.h> #include <linux/pagemap.h> #include <linux/hardirq.h> @@ -78,6 +79,21 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set) } /* + * To prevent the comm cache from being overwritten when no + * tracing is active, only save the comm when a trace event + * occurred. + */ +static DEFINE_PER_CPU(bool, trace_cmdline_save); + +/* + * When a reader is waiting for data, then this variable is + * set to true. + */ +static bool trace_wakeup_needed; + +static struct irq_work trace_work_wakeup; + +/* * Kill all tracing for good (never come back). * It is initialized to 1 but will turn to zero if the initialization * of the tracer is successful. But that is the only place that sets @@ -139,6 +155,18 @@ static int __init set_ftrace_dump_on_oops(char *str) } __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); + +static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; +static char *trace_boot_options __initdata; + +static int __init set_trace_boot_options(char *str) +{ + strncpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); + trace_boot_options = trace_boot_options_buf; + return 0; +} +__setup("trace_options=", set_trace_boot_options); + unsigned long long ns2usecs(cycle_t nsec) { nsec += 500; @@ -198,20 +226,9 @@ static struct trace_array max_tr; static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); -/* tracer_enabled is used to toggle activation of a tracer */ -static int tracer_enabled = 1; - -/** - * tracing_is_enabled - return tracer_enabled status - * - * This function is used by other tracers to know the status - * of the tracer_enabled flag. Tracers may use this function - * to know if it should enable their features when starting - * up. See irqsoff tracer for an example (start_irqsoff_tracer). - */ int tracing_is_enabled(void) { - return tracer_enabled; + return tracing_is_on(); } /* @@ -328,17 +345,23 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | - TRACE_ITER_IRQ_INFO; + TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS; static int trace_stop_count; static DEFINE_RAW_SPINLOCK(tracing_start_lock); -static void wakeup_work_handler(struct work_struct *work) +/** + * trace_wake_up - wake up tasks waiting for trace input + * + * Schedules a delayed work to wake up any task that is blocked on the + * trace_wait queue. These is used with trace_poll for tasks polling the + * trace. + */ +static void trace_wake_up(struct irq_work *work) { - wake_up(&trace_wait); -} + wake_up_all(&trace_wait); -static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler); +} /** * tracing_on - enable tracing buffers @@ -393,22 +416,6 @@ int tracing_is_on(void) } EXPORT_SYMBOL_GPL(tracing_is_on); -/** - * trace_wake_up - wake up tasks waiting for trace input - * - * Schedules a delayed work to wake up any task that is blocked on the - * trace_wait queue. These is used with trace_poll for tasks polling the - * trace. - */ -void trace_wake_up(void) -{ - const unsigned long delay = msecs_to_jiffies(2); - - if (trace_flags & TRACE_ITER_BLOCK) - return; - schedule_delayed_work(&wakeup_work, delay); -} - static int __init set_buf_size(char *str) { unsigned long buf_size; @@ -426,15 +433,15 @@ __setup("trace_buf_size=", set_buf_size); static int __init set_tracing_thresh(char *str) { - unsigned long threshhold; + unsigned long threshold; int ret; if (!str) return 0; - ret = strict_strtoul(str, 0, &threshhold); + ret = kstrtoul(str, 0, &threshold); if (ret < 0) return 0; - tracing_thresh = threshhold * 1000; + tracing_thresh = threshold * 1000; return 1; } __setup("tracing_thresh=", set_tracing_thresh); @@ -470,16 +477,19 @@ static const char *trace_options[] = { "overwrite", "disable_on_free", "irq-info", + "markers", NULL }; static struct { u64 (*func)(void); const char *name; + int in_ns; /* is this clock in nanoseconds? */ } trace_clocks[] = { - { trace_clock_local, "local" }, - { trace_clock_global, "global" }, - { trace_clock_counter, "counter" }, + { trace_clock_local, "local", 1 }, + { trace_clock_global, "global", 1 }, + { trace_clock_counter, "counter", 0 }, + ARCH_TRACE_CLOCKS }; int trace_clock_id; @@ -756,6 +766,40 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) } #endif /* CONFIG_TRACER_MAX_TRACE */ +static void default_wait_pipe(struct trace_iterator *iter) +{ + DEFINE_WAIT(wait); + + prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); + + /* + * The events can happen in critical sections where + * checking a work queue can cause deadlocks. + * After adding a task to the queue, this flag is set + * only to notify events to try to wake up the queue + * using irq_work. + * + * We don't clear it even if the buffer is no longer + * empty. The flag only causes the next event to run + * irq_work to do the work queue wake up. The worse + * that can happen if we race with !trace_empty() is that + * an event will cause an irq_work to try to wake up + * an empty queue. + * + * There's no reason to protect this flag either, as + * the work queue and irq_work logic will do the necessary + * synchronization for the wake ups. The only thing + * that is necessary is that the wake up happens after + * a task has been queued. It's OK for spurious wake ups. + */ + trace_wakeup_needed = true; + + if (trace_empty(iter)) + schedule(); + + finish_wait(&trace_wait, &wait); +} + /** * register_tracer - register a tracer with the ftrace system. * @type - the plugin for the tracer @@ -874,32 +918,6 @@ int register_tracer(struct tracer *type) return ret; } -void unregister_tracer(struct tracer *type) -{ - struct tracer **t; - - mutex_lock(&trace_types_lock); - for (t = &trace_types; *t; t = &(*t)->next) { - if (*t == type) - goto found; - } - pr_info("Tracer %s not registered\n", type->name); - goto out; - - found: - *t = (*t)->next; - - if (type == current_trace && tracer_enabled) { - tracer_enabled = 0; - tracing_stop(); - if (current_trace->stop) - current_trace->stop(&global_trace); - current_trace = &nop_trace; - } -out: - mutex_unlock(&trace_types_lock); -} - void tracing_reset(struct trace_array *tr, int cpu) { struct ring_buffer *buffer = tr->buffer; @@ -1130,10 +1148,14 @@ void trace_find_cmdline(int pid, char comm[]) void tracing_record_cmdline(struct task_struct *tsk) { - if (atomic_read(&trace_record_cmdline_disabled) || !tracer_enabled || - !tracing_is_on()) + if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on()) + return; + + if (!__this_cpu_read(trace_cmdline_save)) return; + __this_cpu_write(trace_cmdline_save, false); + trace_save_cmdline(tsk); } @@ -1177,27 +1199,36 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer, return event; } +void +__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) +{ + __this_cpu_write(trace_cmdline_save, true); + if (trace_wakeup_needed) { + trace_wakeup_needed = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&trace_work_wakeup); + } + ring_buffer_unlock_commit(buffer, event); +} + static inline void __trace_buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event, - unsigned long flags, int pc, - int wake) + unsigned long flags, int pc) { - ring_buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); ftrace_trace_stack(buffer, flags, 6, pc); ftrace_trace_userstack(buffer, flags, pc); - - if (wake) - trace_wake_up(); } void trace_buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event, unsigned long flags, int pc) { - __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); + __trace_buffer_unlock_commit(buffer, event, flags, pc); } +EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); struct ring_buffer_event * trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, @@ -1214,29 +1245,21 @@ void trace_current_buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event, unsigned long flags, int pc) { - __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); + __trace_buffer_unlock_commit(buffer, event, flags, pc); } EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); -void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc) -{ - __trace_buffer_unlock_commit(buffer, event, flags, pc, 0); -} -EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); - -void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc, - struct pt_regs *regs) +void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc, + struct pt_regs *regs) { - ring_buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } -EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs); +EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs); void trace_current_buffer_discard_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) @@ -1268,7 +1291,7 @@ trace_function(struct trace_array *tr, entry->parent_ip = parent_ip; if (!filter_check_discard(call, entry, buffer, event)) - ring_buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); } void @@ -1361,7 +1384,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, entry->size = trace.nr_entries; if (!filter_check_discard(call, entry, buffer, event)) - ring_buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); out: /* Again, don't let gcc optimize things here */ @@ -1457,7 +1480,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) save_stack_trace_user(&trace); if (!filter_check_discard(call, entry, buffer, event)) - ring_buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); out_drop_count: __this_cpu_dec(user_stack_count); @@ -1558,10 +1581,10 @@ static int alloc_percpu_trace_buffer(void) return -ENOMEM; } +static int buffers_allocated; + void trace_printk_init_buffers(void) { - static int buffers_allocated; - if (buffers_allocated) return; @@ -1570,7 +1593,38 @@ void trace_printk_init_buffers(void) pr_info("ftrace: Allocated trace_printk buffers\n"); + /* Expand the buffers to set size */ + tracing_update_buffers(); + buffers_allocated = 1; + + /* + * trace_printk_init_buffers() can be called by modules. + * If that happens, then we need to start cmdline recording + * directly here. If the global_trace.buffer is already + * allocated here, then this was called by module code. + */ + if (global_trace.buffer) + tracing_start_cmdline_record(); +} + +void trace_printk_start_comm(void) +{ + /* Start tracing comms if trace printk is set */ + if (!buffers_allocated) + return; + tracing_start_cmdline_record(); +} + +static void trace_printk_start_stop_comm(int enabled) +{ + if (!buffers_allocated) + return; + + if (enabled) + tracing_start_cmdline_record(); + else + tracing_stop_cmdline_record(); } /** @@ -1621,7 +1675,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) memcpy(entry->buf, tbuffer, sizeof(u32) * len); if (!filter_check_discard(call, entry, buffer, event)) { - ring_buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); ftrace_trace_stack(buffer, flags, 6, pc); } @@ -1692,7 +1746,7 @@ int trace_array_vprintk(struct trace_array *tr, memcpy(&entry->buf, tbuffer, len); entry->buf[len] = '\0'; if (!filter_check_discard(call, entry, buffer, event)) { - ring_buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); ftrace_trace_stack(buffer, flags, 6, pc); } out: @@ -2060,7 +2114,8 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) seq_puts(m, "# -----------------\n"); seq_printf(m, "# | task: %.16s-%d " "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", - data->comm, data->pid, data->uid, data->nice, + data->comm, data->pid, + from_kuid_munged(seq_user_ns(m), data->uid), data->nice, data->policy, data->rt_priority); seq_puts(m, "# -----------------\n"); @@ -2424,6 +2479,10 @@ __tracing_open(struct inode *inode, struct file *file) if (ring_buffer_overruns(iter->tr->buffer)) iter->iter_flags |= TRACE_FILE_ANNOTATE; + /* Output in nanoseconds only if we are using a clock in nanoseconds. */ + if (trace_clocks[trace_clock_id].in_ns) + iter->iter_flags |= TRACE_FILE_TIME_IN_NS; + /* stop the trace while dumping */ tracing_stop(); @@ -2792,26 +2851,19 @@ static void set_tracer_flags(unsigned int mask, int enabled) if (mask == TRACE_ITER_OVERWRITE) ring_buffer_change_overwrite(global_trace.buffer, enabled); + + if (mask == TRACE_ITER_PRINTK) + trace_printk_start_stop_comm(enabled); } -static ssize_t -tracing_trace_options_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +static int trace_set_options(char *option) { - char buf[64]; char *cmp; int neg = 0; - int ret; + int ret = 0; int i; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - cmp = strstrip(buf); + cmp = strstrip(option); if (strncmp(cmp, "no", 2) == 0) { neg = 1; @@ -2830,10 +2882,25 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, mutex_lock(&trace_types_lock); ret = set_tracer_option(current_trace, cmp, neg); mutex_unlock(&trace_types_lock); - if (ret) - return ret; } + return ret; +} + +static ssize_t +tracing_trace_options_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + trace_set_options(buf); + *ppos += cnt; return cnt; @@ -2938,56 +3005,6 @@ static const struct file_operations tracing_saved_cmdlines_fops = { }; static ssize_t -tracing_ctrl_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - int r; - - r = sprintf(buf, "%u\n", tracer_enabled); - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -tracing_ctrl_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - struct trace_array *tr = filp->private_data; - unsigned long val; - int ret; - - ret = kstrtoul_from_user(ubuf, cnt, 10, &val); - if (ret) - return ret; - - val = !!val; - - mutex_lock(&trace_types_lock); - if (tracer_enabled ^ val) { - - /* Only need to warn if this is used to change the state */ - WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on"); - - if (val) { - tracer_enabled = 1; - if (current_trace->start) - current_trace->start(tr); - tracing_start(); - } else { - tracer_enabled = 0; - tracing_stop(); - if (current_trace->stop) - current_trace->stop(tr); - } - } - mutex_unlock(&trace_types_lock); - - *ppos += cnt; - - return cnt; -} - -static ssize_t tracing_set_trace_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -3017,6 +3034,31 @@ static void set_buffer_entries(struct trace_array *tr, unsigned long val) tr->data[cpu]->entries = val; } +/* resize @tr's buffer to the size of @size_tr's entries */ +static int resize_buffer_duplicate_size(struct trace_array *tr, + struct trace_array *size_tr, int cpu_id) +{ + int cpu, ret = 0; + + if (cpu_id == RING_BUFFER_ALL_CPUS) { + for_each_tracing_cpu(cpu) { + ret = ring_buffer_resize(tr->buffer, + size_tr->data[cpu]->entries, cpu); + if (ret < 0) + break; + tr->data[cpu]->entries = size_tr->data[cpu]->entries; + } + } else { + ret = ring_buffer_resize(tr->buffer, + size_tr->data[cpu_id]->entries, cpu_id); + if (ret == 0) + tr->data[cpu_id]->entries = + size_tr->data[cpu_id]->entries; + } + + return ret; +} + static int __tracing_resize_ring_buffer(unsigned long size, int cpu) { int ret; @@ -3028,6 +3070,10 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu) */ ring_buffer_expanded = 1; + /* May be called before buffers are initialized */ + if (!global_trace.buffer) + return 0; + ret = ring_buffer_resize(global_trace.buffer, size, cpu); if (ret < 0) return ret; @@ -3037,23 +3083,8 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu) ret = ring_buffer_resize(max_tr.buffer, size, cpu); if (ret < 0) { - int r = 0; - - if (cpu == RING_BUFFER_ALL_CPUS) { - int i; - for_each_tracing_cpu(i) { - r = ring_buffer_resize(global_trace.buffer, - global_trace.data[i]->entries, - i); - if (r < 0) - break; - } - } else { - r = ring_buffer_resize(global_trace.buffer, - global_trace.data[cpu]->entries, - cpu); - } - + int r = resize_buffer_duplicate_size(&global_trace, + &global_trace, cpu); if (r < 0) { /* * AARGH! We are left with different @@ -3191,17 +3222,11 @@ static int tracing_set_tracer(const char *buf) topts = create_trace_option_files(t); if (t->use_max_tr) { - int cpu; /* we need to make per cpu buffer sizes equivalent */ - for_each_tracing_cpu(cpu) { - ret = ring_buffer_resize(max_tr.buffer, - global_trace.data[cpu]->entries, - cpu); - if (ret < 0) - goto out; - max_tr.data[cpu]->entries = - global_trace.data[cpu]->entries; - } + ret = resize_buffer_duplicate_size(&max_tr, &global_trace, + RING_BUFFER_ALL_CPUS); + if (ret < 0) + goto out; } if (t->init) { @@ -3323,6 +3348,10 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) if (trace_flags & TRACE_ITER_LATENCY_FMT) iter->iter_flags |= TRACE_FILE_LAT_FMT; + /* Output in nanoseconds only if we are using a clock in nanoseconds. */ + if (trace_clocks[trace_clock_id].in_ns) + iter->iter_flags |= TRACE_FILE_TIME_IN_NS; + iter->cpu_file = cpu_file; iter->tr = &global_trace; mutex_init(&iter->mutex); @@ -3383,19 +3412,6 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table) } } - -void default_wait_pipe(struct trace_iterator *iter) -{ - DEFINE_WAIT(wait); - - prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); - - if (trace_empty(iter)) - schedule(); - - finish_wait(&trace_wait, &wait); -} - /* * This is a make-shift waitqueue. * A tracer might use this callback on some rare cases: @@ -3436,7 +3452,7 @@ static int tracing_wait_pipe(struct file *filp) return -EINTR; /* - * We block until we read something and tracing is disabled. + * We block until we read something and tracing is enabled. * We still block if tracing is disabled, but we have never * read anything. This allows a user to cat this file, and * then enable tracing. But after we have read something, @@ -3444,7 +3460,7 @@ static int tracing_wait_pipe(struct file *filp) * * iter->pos will be 0 if we haven't read anything. */ - if (!tracer_enabled && iter->pos) + if (tracing_is_enabled() && iter->pos) break; } @@ -3886,6 +3902,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (tracing_disabled) return -EINVAL; + if (!(trace_flags & TRACE_ITER_MARKERS)) + return -EINVAL; + if (cnt > TRACE_BUF_SIZE) cnt = TRACE_BUF_SIZE; @@ -3950,7 +3969,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, } else entry->buf[cnt] = '\0'; - ring_buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); written = cnt; @@ -4011,6 +4030,14 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, if (max_tr.buffer) ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); + /* + * New clock may not be consistent with the previous clock. + * Reset the buffer so that it doesn't have incomparable timestamps. + */ + tracing_reset_online_cpus(&global_trace); + if (max_tr.buffer) + tracing_reset_online_cpus(&max_tr); + mutex_unlock(&trace_types_lock); *fpos += cnt; @@ -4032,13 +4059,6 @@ static const struct file_operations tracing_max_lat_fops = { .llseek = generic_file_llseek, }; -static const struct file_operations tracing_ctrl_fops = { - .open = tracing_open_generic, - .read = tracing_ctrl_read, - .write = tracing_ctrl_write, - .llseek = generic_file_llseek, -}; - static const struct file_operations set_tracer_fops = { .open = tracing_open_generic, .read = tracing_set_trace_read, @@ -4195,12 +4215,6 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, buf->private = 0; } -static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - return 1; -} - static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { @@ -4216,7 +4230,7 @@ static const struct pipe_buf_operations buffer_pipe_buf_ops = { .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = buffer_pipe_buf_release, - .steal = buffer_pipe_buf_steal, + .steal = generic_pipe_buf_steal, .get = buffer_pipe_buf_get, }; @@ -4261,13 +4275,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, return -ENOMEM; if (*ppos & (PAGE_SIZE - 1)) { - WARN_ONCE(1, "Ftrace: previous read must page-align\n"); ret = -EINVAL; goto out; } if (len & (PAGE_SIZE - 1)) { - WARN_ONCE(1, "Ftrace: splice_read should page-align\n"); if (len < PAGE_SIZE) { ret = -EINVAL; goto out; @@ -4378,13 +4390,27 @@ tracing_stats_read(struct file *filp, char __user *ubuf, cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); trace_seq_printf(s, "bytes: %ld\n", cnt); - t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); - usec_rem = do_div(t, USEC_PER_SEC); - trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); + if (trace_clocks[trace_clock_id].in_ns) { + /* local or global for trace_clock */ + t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); + usec_rem = do_div(t, USEC_PER_SEC); + trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", + t, usec_rem); + + t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); + usec_rem = do_div(t, USEC_PER_SEC); + trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); + } else { + /* counter or tsc mode for trace_clock */ + trace_seq_printf(s, "oldest event ts: %llu\n", + ring_buffer_oldest_event_ts(tr->buffer, cpu)); + + trace_seq_printf(s, "now ts: %llu\n", + ring_buffer_time_stamp(tr->buffer, cpu)); + } - t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); - usec_rem = do_div(t, USEC_PER_SEC); - trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); + cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); + trace_seq_printf(s, "dropped events: %ld\n", cnt); count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); @@ -4816,9 +4842,6 @@ static __init int tracer_init_debugfs(void) d_tracer = tracing_init_dentry(); - trace_create_file("tracing_enabled", 0644, d_tracer, - &global_trace, &tracing_ctrl_fops); - trace_create_file("trace_options", 0644, d_tracer, NULL, &tracing_iter_fops); @@ -5090,6 +5113,7 @@ __init static int tracer_alloc_buffers(void) /* Only allocate trace_printk buffers if a trace_printk exists */ if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt) + /* Must be called before global_trace.buffer is allocated */ trace_printk_init_buffers(); /* To save memory, keep the ring buffer size to its minimum */ @@ -5137,6 +5161,7 @@ __init static int tracer_alloc_buffers(void) #endif trace_init_cmdlines(); + init_irq_work(&trace_work_wakeup, trace_wake_up); register_tracer(&nop_trace); current_trace = &nop_trace; @@ -5148,6 +5173,13 @@ __init static int tracer_alloc_buffers(void) register_die_notifier(&trace_die_notifier); + while (trace_boot_options) { + char *option; + + option = strsep(&trace_boot_options, ","); + trace_set_options(option); + } + return 0; out_free_cpumask: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 55e1f7f0db12..c75d7988902c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -147,7 +147,7 @@ struct trace_array_cpu { unsigned long skipped_entries; cycle_t preempt_timestamp; pid_t pid; - uid_t uid; + kuid_t uid; char comm[TASK_COMM_LEN]; }; @@ -285,8 +285,8 @@ struct tracer { int (*set_flag)(u32 old_flags, u32 bit, int set); struct tracer *next; struct tracer_flags *flags; - int print_max; - int use_max_tr; + bool print_max; + bool use_max_tr; }; @@ -327,7 +327,6 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu) int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); -void trace_wake_up(void); void tracing_reset(struct trace_array *tr, int cpu); void tracing_reset_online_cpus(struct trace_array *tr); void tracing_reset_current(int cpu); @@ -349,9 +348,6 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long len, unsigned long flags, int pc); -void trace_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc); struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); @@ -359,6 +355,9 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts); +void __buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event); + int trace_empty(struct trace_iterator *iter); void *trace_find_next_entry_inc(struct trace_iterator *iter); @@ -367,7 +366,6 @@ void trace_init_global_iter(struct trace_iterator *iter); void tracing_iter_reset(struct trace_iterator *iter, int cpu); -void default_wait_pipe(struct trace_iterator *iter); void poll_wait_pipe(struct trace_iterator *iter); void ftrace(struct trace_array *tr, @@ -407,12 +405,7 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr); void tracing_stop_sched_switch_record(void); void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); -void unregister_tracer(struct tracer *type); int is_tracing_stopped(void); -enum trace_file_type { - TRACE_FILE_LAT_FMT = 1, - TRACE_FILE_ANNOTATE = 2, -}; extern cpumask_var_t __read_mostly tracing_buffer_mask; @@ -472,11 +465,11 @@ extern void trace_find_cmdline(int pid, char comm[]); #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; +#endif #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func extern int DYN_FTRACE_TEST_NAME(void); #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 extern int DYN_FTRACE_TEST_NAME2(void); -#endif extern int ring_buffer_expanded; extern bool tracing_selftest_disabled; @@ -680,6 +673,7 @@ enum trace_iterator_flags { TRACE_ITER_OVERWRITE = 0x200000, TRACE_ITER_STOP_ON_FREE = 0x400000, TRACE_ITER_IRQ_INFO = 0x800000, + TRACE_ITER_MARKERS = 0x1000000, }; /* @@ -840,6 +834,7 @@ extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; void trace_printk_init_buffers(void); +void trace_printk_start_comm(void); #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 8d3538b4ea5f..95e96842ed29 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -77,7 +77,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry->correct = val == expect; if (!filter_check_discard(call, entry, buffer, event)) - ring_buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); out: atomic_dec(&tr->data[cpu]->disabled); @@ -199,7 +199,7 @@ __init static int init_branch_tracer(void) } return register_tracer(&branch_trace); } -device_initcall(init_branch_tracer); +core_initcall(init_branch_tracer); #else static inline diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 8a6d2ee2086c..84b1e045faba 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -258,7 +258,8 @@ EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); #ifdef CONFIG_FUNCTION_TRACER static void -perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip) +perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct pt_regs *pt_regs) { struct ftrace_entry *entry; struct hlist_head *head; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 29111da1d100..880073d0b946 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -491,19 +491,6 @@ static void t_stop(struct seq_file *m, void *p) mutex_unlock(&event_mutex); } -static int -ftrace_event_seq_open(struct inode *inode, struct file *file) -{ - const struct seq_operations *seq_ops; - - if ((file->f_mode & FMODE_WRITE) && - (file->f_flags & O_TRUNC)) - ftrace_clear_events(); - - seq_ops = inode->i_private; - return seq_open(file, seq_ops); -} - static ssize_t event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -980,6 +967,9 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) return r; } +static int ftrace_event_avail_open(struct inode *inode, struct file *file); +static int ftrace_event_set_open(struct inode *inode, struct file *file); + static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, @@ -995,14 +985,14 @@ static const struct seq_operations show_set_event_seq_ops = { }; static const struct file_operations ftrace_avail_fops = { - .open = ftrace_event_seq_open, + .open = ftrace_event_avail_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; static const struct file_operations ftrace_set_event_fops = { - .open = ftrace_event_seq_open, + .open = ftrace_event_set_open, .read = seq_read, .write = ftrace_event_write, .llseek = seq_lseek, @@ -1078,6 +1068,26 @@ static struct dentry *event_trace_events_dir(void) return d_events; } +static int +ftrace_event_avail_open(struct inode *inode, struct file *file) +{ + const struct seq_operations *seq_ops = &show_event_seq_ops; + + return seq_open(file, seq_ops); +} + +static int +ftrace_event_set_open(struct inode *inode, struct file *file) +{ + const struct seq_operations *seq_ops = &show_set_event_seq_ops; + + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + ftrace_clear_events(); + + return seq_open(file, seq_ops); +} + static struct dentry * event_subsystem_dir(const char *name, struct dentry *d_events) { @@ -1199,6 +1209,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, return 0; } +static void event_remove(struct ftrace_event_call *call) +{ + ftrace_event_enable_disable(call, 0); + if (call->event.funcs) + __unregister_ftrace_event(&call->event); + list_del(&call->list); +} + +static int event_init(struct ftrace_event_call *call) +{ + int ret = 0; + + if (WARN_ON(!call->name)) + return -EINVAL; + + if (call->class->raw_init) { + ret = call->class->raw_init(call); + if (ret < 0 && ret != -ENOSYS) + pr_warn("Could not initialize trace events/%s\n", + call->name); + } + + return ret; +} + static int __trace_add_event_call(struct ftrace_event_call *call, struct module *mod, const struct file_operations *id, @@ -1209,19 +1244,9 @@ __trace_add_event_call(struct ftrace_event_call *call, struct module *mod, struct dentry *d_events; int ret; - /* The linker may leave blanks */ - if (!call->name) - return -EINVAL; - - if (call->class->raw_init) { - ret = call->class->raw_init(call); - if (ret < 0) { - if (ret != -ENOSYS) - pr_warning("Could not initialize trace events/%s\n", - call->name); - return ret; - } - } + ret = event_init(call); + if (ret < 0) + return ret; d_events = event_trace_events_dir(); if (!d_events) @@ -1272,13 +1297,10 @@ static void remove_subsystem_dir(const char *name) */ static void __trace_remove_event_call(struct ftrace_event_call *call) { - ftrace_event_enable_disable(call, 0); - if (call->event.funcs) - __unregister_ftrace_event(&call->event); - debugfs_remove_recursive(call->dir); - list_del(&call->list); + event_remove(call); trace_destroy_fields(call); destroy_preds(call); + debugfs_remove_recursive(call->dir); remove_subsystem_dir(call->class->system); } @@ -1450,30 +1472,59 @@ static __init int setup_trace_event(char *str) } __setup("trace_event=", setup_trace_event); +static __init int event_trace_enable(void) +{ + struct ftrace_event_call **iter, *call; + char *buf = bootup_event_buf; + char *token; + int ret; + + for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) { + + call = *iter; + ret = event_init(call); + if (!ret) + list_add(&call->list, &ftrace_events); + } + + while (true) { + token = strsep(&buf, ","); + + if (!token) + break; + if (!*token) + continue; + + ret = ftrace_set_clr_event(token, 1); + if (ret) + pr_warn("Failed to enable trace event: %s\n", token); + } + + trace_printk_start_comm(); + + return 0; +} + static __init int event_trace_init(void) { - struct ftrace_event_call **call; + struct ftrace_event_call *call; struct dentry *d_tracer; struct dentry *entry; struct dentry *d_events; int ret; - char *buf = bootup_event_buf; - char *token; d_tracer = tracing_init_dentry(); if (!d_tracer) return 0; entry = debugfs_create_file("available_events", 0444, d_tracer, - (void *)&show_event_seq_ops, - &ftrace_avail_fops); + NULL, &ftrace_avail_fops); if (!entry) pr_warning("Could not create debugfs " "'available_events' entry\n"); entry = debugfs_create_file("set_event", 0644, d_tracer, - (void *)&show_set_event_seq_ops, - &ftrace_set_event_fops); + NULL, &ftrace_set_event_fops); if (!entry) pr_warning("Could not create debugfs " "'set_event' entry\n"); @@ -1497,24 +1548,19 @@ static __init int event_trace_init(void) if (trace_define_common_fields()) pr_warning("tracing: Failed to allocate common fields"); - for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { - __trace_add_event_call(*call, NULL, &ftrace_event_id_fops, + /* + * Early initialization already enabled ftrace event. + * Now it's only necessary to create the event directory. + */ + list_for_each_entry(call, &ftrace_events, list) { + + ret = event_create_dir(call, d_events, + &ftrace_event_id_fops, &ftrace_enable_fops, &ftrace_event_filter_fops, &ftrace_event_format_fops); - } - - while (true) { - token = strsep(&buf, ","); - - if (!token) - break; - if (!*token) - continue; - - ret = ftrace_set_clr_event(token, 1); - if (ret) - pr_warning("Failed to enable trace event: %s\n", token); + if (ret < 0) + event_remove(call); } ret = register_module_notifier(&trace_module_nb); @@ -1523,6 +1569,7 @@ static __init int event_trace_init(void) return 0; } +core_initcall(event_trace_enable); fs_initcall(event_trace_init); #ifdef CONFIG_FTRACE_STARTUP_TEST @@ -1646,9 +1693,11 @@ static __init void event_trace_self_tests(void) event_test_stuff(); ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); - if (WARN_ON_ONCE(ret)) + if (WARN_ON_ONCE(ret)) { pr_warning("error disabling system %s\n", system->name); + continue; + } pr_cont("OK\n"); } @@ -1681,7 +1730,8 @@ static __init void event_trace_self_tests(void) static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); static void -function_test_events_call(unsigned long ip, unsigned long parent_ip) +function_test_events_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) { struct ring_buffer_event *event; struct ring_buffer *buffer; @@ -1710,7 +1760,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) entry->ip = ip; entry->parent_ip = parent_ip; - trace_nowake_buffer_unlock_commit(buffer, event, flags, pc); + trace_buffer_unlock_commit(buffer, event, flags, pc); out: atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); @@ -1720,6 +1770,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __initdata = { .func = function_test_events_call, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static __init void event_trace_self_test_with_function(void) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 431dba8b7542..e5b0ca8b8d4d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1000,9 +1000,9 @@ static int init_pred(struct filter_parse_state *ps, } } else { if (field->is_signed) - ret = strict_strtoll(pred->regex.pattern, 0, &val); + ret = kstrtoll(pred->regex.pattern, 0, &val); else - ret = strict_strtoull(pred->regex.pattern, 0, &val); + ret = kstrtoull(pred->regex.pattern, 0, &val); if (ret) { parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); return -EINVAL; @@ -2002,7 +2002,7 @@ static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter, static int __ftrace_function_set_filter(int filter, char *buf, int len, struct function_filter_data *data) { - int i, re_cnt, ret; + int i, re_cnt, ret = -EINVAL; int *reset; char **re; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index a426f410c060..8e3ad8082ab7 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -7,13 +7,12 @@ * Based on code from the latency_tracer, that is: * * Copyright (C) 2004-2006 Ingo Molnar - * Copyright (C) 2004 William Lee Irwin III + * Copyright (C) 2004 Nadia Yvette Chambers */ #include <linux/ring_buffer.h> #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/ftrace.h> -#include <linux/pstore.h> #include <linux/fs.h> #include "trace.h" @@ -49,7 +48,8 @@ static void function_trace_start(struct trace_array *tr) } static void -function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) +function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) { struct trace_array *tr = func_trace; struct trace_array_cpu *data; @@ -75,16 +75,17 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) preempt_enable_notrace(); } -/* Our two options */ +/* Our option */ enum { TRACE_FUNC_OPT_STACK = 0x1, - TRACE_FUNC_OPT_PSTORE = 0x2, }; static struct tracer_flags func_flags; static void -function_trace_call(unsigned long ip, unsigned long parent_ip) +function_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) + { struct trace_array *tr = func_trace; struct trace_array_cpu *data; @@ -106,12 +107,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { - /* - * So far tracing doesn't support multiple buffers, so - * we make an explicit call for now. - */ - if (unlikely(func_flags.val & TRACE_FUNC_OPT_PSTORE)) - pstore_ftrace_call(ip, parent_ip); pc = preempt_count(); trace_function(tr, ip, parent_ip, flags, pc); } @@ -121,7 +116,8 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) } static void -function_stack_trace_call(unsigned long ip, unsigned long parent_ip) +function_stack_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) { struct trace_array *tr = func_trace; struct trace_array_cpu *data; @@ -164,22 +160,19 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __read_mostly = { .func = function_trace_call, - .flags = FTRACE_OPS_FL_GLOBAL, + .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, }; static struct ftrace_ops trace_stack_ops __read_mostly = { .func = function_stack_trace_call, - .flags = FTRACE_OPS_FL_GLOBAL, + .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, }; static struct tracer_opt func_opts[] = { #ifdef CONFIG_STACKTRACE { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, #endif -#ifdef CONFIG_PSTORE_FTRACE - { TRACER_OPT(func_pstore, TRACE_FUNC_OPT_PSTORE) }, -#endif { } /* Always set a last empty entry */ }; @@ -232,8 +225,6 @@ static int func_set_flag(u32 old_flags, u32 bit, int set) } break; - case TRACE_FUNC_OPT_PSTORE: - break; default: return -EINVAL; } @@ -375,7 +366,7 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash, * We use the callback data field (which is a pointer) * as our counter. */ - ret = strict_strtoul(number, 0, (unsigned long *)&count); + ret = kstrtoul(number, 0, (unsigned long *)&count); if (ret) return ret; @@ -420,5 +411,4 @@ static __init int init_function_trace(void) init_func_cmd_traceon(); return register_tracer(&function_trace); } -device_initcall(init_function_trace); - +core_initcall(init_function_trace); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index ce27c8ba8d31..4edb4b74eb7e 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -143,7 +143,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, return; } -#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST +#if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY) /* * The arch may choose to record the frame pointer used * and check it here to make sure that it is what we expect it @@ -154,6 +154,9 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, * * Currently, x86_32 with optimize for size (-Os) makes the latest * gcc do the above. + * + * Note, -mfentry does not use frame pointers, and this test + * is not needed if CC_USING_FENTRY is set. */ if (unlikely(current->ret_stack[index].fp != frame_pointer)) { ftrace_graph_stop(); @@ -220,7 +223,7 @@ int __trace_graph_entry(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->graph_ent = *trace; if (!filter_current_check_discard(buffer, call, entry, event)) - ring_buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); return 1; } @@ -324,7 +327,7 @@ void __trace_graph_return(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->ret = *trace; if (!filter_current_check_discard(buffer, call, entry, event)) - ring_buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); } void trace_graph_return(struct ftrace_graph_ret *trace) @@ -1471,4 +1474,4 @@ static __init int init_graph_trace(void) return register_tracer(&graph_trace); } -device_initcall(init_graph_trace); +core_initcall(init_graph_trace); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 99d20e920368..713a2cac4881 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -7,7 +7,7 @@ * From code in the latency_tracer, that is: * * Copyright (C) 2004-2006 Ingo Molnar - * Copyright (C) 2004 William Lee Irwin III + * Copyright (C) 2004 Nadia Yvette Chambers */ #include <linux/kallsyms.h> #include <linux/debugfs.h> @@ -136,7 +136,8 @@ static int func_prolog_dec(struct trace_array *tr, * irqsoff uses its own tracer function to keep the overhead down: */ static void -irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) { struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; @@ -153,7 +154,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __read_mostly = { .func = irqsoff_tracer_call, - .flags = FTRACE_OPS_FL_GLOBAL, + .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, }; #endif /* CONFIG_FUNCTION_TRACER */ @@ -603,7 +604,7 @@ static struct tracer irqsoff_tracer __read_mostly = .reset = irqsoff_tracer_reset, .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, - .print_max = 1, + .print_max = true, .print_header = irqsoff_print_header, .print_line = irqsoff_print_line, .flags = &tracer_flags, @@ -613,7 +614,7 @@ static struct tracer irqsoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, - .use_max_tr = 1, + .use_max_tr = true, }; # define register_irqsoff(trace) register_tracer(&trace) #else @@ -636,7 +637,7 @@ static struct tracer preemptoff_tracer __read_mostly = .reset = irqsoff_tracer_reset, .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, - .print_max = 1, + .print_max = true, .print_header = irqsoff_print_header, .print_line = irqsoff_print_line, .flags = &tracer_flags, @@ -646,7 +647,7 @@ static struct tracer preemptoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, - .use_max_tr = 1, + .use_max_tr = true, }; # define register_preemptoff(trace) register_tracer(&trace) #else @@ -671,7 +672,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .reset = irqsoff_tracer_reset, .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, - .print_max = 1, + .print_max = true, .print_header = irqsoff_print_header, .print_line = irqsoff_print_line, .flags = &tracer_flags, @@ -681,7 +682,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, - .use_max_tr = 1, + .use_max_tr = true, }; # define register_preemptirqsoff(trace) register_tracer(&trace) @@ -697,4 +698,4 @@ __init static int init_irqsoff_tracer(void) return 0; } -device_initcall(init_irqsoff_tracer); +core_initcall(init_irqsoff_tracer); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1a2117043bb1..1865d5f76538 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -444,7 +444,7 @@ static int create_trace_probe(int argc, char **argv) return -EINVAL; } /* an address specified */ - ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr); + ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr); if (ret) { pr_info("Failed to parse address.\n"); return ret; @@ -751,8 +751,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); if (!filter_current_check_discard(buffer, call, entry, event)) - trace_nowake_buffer_unlock_commit_regs(buffer, event, - irq_flags, pc, regs); + trace_buffer_unlock_commit_regs(buffer, event, + irq_flags, pc, regs); } /* Kretprobe handler */ @@ -784,8 +784,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); if (!filter_current_check_discard(buffer, call, entry, event)) - trace_nowake_buffer_unlock_commit_regs(buffer, event, - irq_flags, pc, regs); + trace_buffer_unlock_commit_regs(buffer, event, + irq_flags, pc, regs); } /* Event entry printers */ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 123b189c732c..194d79602dc7 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -610,24 +610,54 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) return trace_print_lat_fmt(s, entry); } -static unsigned long preempt_mark_thresh = 100; +static unsigned long preempt_mark_thresh_us = 100; static int -lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, - unsigned long rel_usecs) +lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) { - return trace_seq_printf(s, " %4lldus%c: ", abs_usecs, - rel_usecs > preempt_mark_thresh ? '!' : - rel_usecs > 1 ? '+' : ' '); + unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE; + unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; + unsigned long long abs_ts = iter->ts - iter->tr->time_start; + unsigned long long rel_ts = next_ts - iter->ts; + struct trace_seq *s = &iter->seq; + + if (in_ns) { + abs_ts = ns2usecs(abs_ts); + rel_ts = ns2usecs(rel_ts); + } + + if (verbose && in_ns) { + unsigned long abs_usec = do_div(abs_ts, USEC_PER_MSEC); + unsigned long abs_msec = (unsigned long)abs_ts; + unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC); + unsigned long rel_msec = (unsigned long)rel_ts; + + return trace_seq_printf( + s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ", + ns2usecs(iter->ts), + abs_msec, abs_usec, + rel_msec, rel_usec); + } else if (verbose && !in_ns) { + return trace_seq_printf( + s, "[%016llx] %lld (+%lld): ", + iter->ts, abs_ts, rel_ts); + } else if (!verbose && in_ns) { + return trace_seq_printf( + s, " %4lldus%c: ", + abs_ts, + rel_ts > preempt_mark_thresh_us ? '!' : + rel_ts > 1 ? '+' : ' '); + } else { /* !verbose && !in_ns */ + return trace_seq_printf(s, " %4lld: ", abs_ts); + } } int trace_print_context(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent; - unsigned long long t = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(t, USEC_PER_SEC); - unsigned long secs = (unsigned long)t; + unsigned long long t; + unsigned long secs, usec_rem; char comm[TASK_COMM_LEN]; int ret; @@ -644,8 +674,13 @@ int trace_print_context(struct trace_iterator *iter) return 0; } - return trace_seq_printf(s, " %5lu.%06lu: ", - secs, usec_rem); + if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { + t = ns2usecs(iter->ts); + usec_rem = do_div(t, USEC_PER_SEC); + secs = (unsigned long)t; + return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem); + } else + return trace_seq_printf(s, " %12llu: ", iter->ts); } int trace_print_lat_context(struct trace_iterator *iter) @@ -659,36 +694,29 @@ int trace_print_lat_context(struct trace_iterator *iter) *next_entry = trace_find_next_entry(iter, NULL, &next_ts); unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); - unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); - unsigned long rel_usecs; /* Restore the original ent_size */ iter->ent_size = ent_size; if (!next_entry) next_ts = iter->ts; - rel_usecs = ns2usecs(next_ts - iter->ts); if (verbose) { char comm[TASK_COMM_LEN]; trace_find_cmdline(entry->pid, comm); - ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08llx]" - " %ld.%03ldms (+%ld.%03ldms): ", comm, - entry->pid, iter->cpu, entry->flags, - entry->preempt_count, iter->idx, - ns2usecs(iter->ts), - abs_usecs / USEC_PER_MSEC, - abs_usecs % USEC_PER_MSEC, - rel_usecs / USEC_PER_MSEC, - rel_usecs % USEC_PER_MSEC); + ret = trace_seq_printf( + s, "%16s %5d %3d %d %08x %08lx ", + comm, entry->pid, iter->cpu, entry->flags, + entry->preempt_count, iter->idx); } else { ret = lat_print_generic(s, entry, iter->cpu); - if (ret) - ret = lat_print_timestamp(s, abs_usecs, rel_usecs); } + if (ret) + ret = lat_print_timestamp(iter, next_ts); + return ret; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index daa9980153af..412e959709b4 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -441,7 +441,7 @@ static const struct fetch_type *find_fetch_type(const char *type) goto fail; type++; - if (strict_strtoul(type, 0, &bs)) + if (kstrtoul(type, 0, &bs)) goto fail; switch (bs) { @@ -501,8 +501,8 @@ int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset) tmp = strchr(symbol, '+'); if (tmp) { - /* skip sign because strict_strtol doesn't accept '+' */ - ret = strict_strtoul(tmp + 1, 0, offset); + /* skip sign because kstrtoul doesn't accept '+' */ + ret = kstrtoul(tmp + 1, 0, offset); if (ret) return ret; @@ -533,7 +533,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, else ret = -EINVAL; } else if (isdigit(arg[5])) { - ret = strict_strtoul(arg + 5, 10, ¶m); + ret = kstrtoul(arg + 5, 10, ¶m); if (ret || param > PARAM_MAX_STACK) ret = -EINVAL; else { @@ -579,7 +579,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, case '@': /* memory or symbol */ if (isdigit(arg[1])) { - ret = strict_strtoul(arg + 1, 0, ¶m); + ret = kstrtoul(arg + 1, 0, ¶m); if (ret) break; @@ -597,14 +597,14 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, break; case '+': /* deref memory */ - arg++; /* Skip '+', because strict_strtol() rejects it. */ + arg++; /* Skip '+', because kstrtol() rejects it. */ case '-': tmp = strchr(arg, '('); if (!tmp) break; *tmp = '\0'; - ret = strict_strtol(arg, 0, &offset); + ret = kstrtol(arg, 0, &offset); if (ret) break; diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 7e62c0a18456..3374c792ccd8 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -102,9 +102,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_cpu = task_cpu(wakee); if (!filter_check_discard(call, entry, buffer, event)) - ring_buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr->buffer, flags, 6, pc); - ftrace_trace_userstack(tr->buffer, flags, pc); + trace_buffer_unlock_commit(buffer, event, flags, pc); } static void diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index ff791ea48b57..9fe45fcefca0 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -7,7 +7,7 @@ * Based on code from the latency_tracer, that is: * * Copyright (C) 2004-2006 Ingo Molnar - * Copyright (C) 2004 William Lee Irwin III + * Copyright (C) 2004 Nadia Yvette Chambers */ #include <linux/module.h> #include <linux/fs.h> @@ -108,7 +108,8 @@ out_enable: * wakeup uses its own tracer function to keep the overhead down: */ static void -wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) +wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) { struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; @@ -129,7 +130,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __read_mostly = { .func = wakeup_tracer_call, - .flags = FTRACE_OPS_FL_GLOBAL, + .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, }; #endif /* CONFIG_FUNCTION_TRACER */ @@ -588,7 +589,7 @@ static struct tracer wakeup_tracer __read_mostly = .reset = wakeup_tracer_reset, .start = wakeup_tracer_start, .stop = wakeup_tracer_stop, - .print_max = 1, + .print_max = true, .print_header = wakeup_print_header, .print_line = wakeup_print_line, .flags = &tracer_flags, @@ -598,7 +599,7 @@ static struct tracer wakeup_tracer __read_mostly = #endif .open = wakeup_trace_open, .close = wakeup_trace_close, - .use_max_tr = 1, + .use_max_tr = true, }; static struct tracer wakeup_rt_tracer __read_mostly = @@ -609,7 +610,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = .start = wakeup_tracer_start, .stop = wakeup_tracer_stop, .wait_pipe = poll_wait_pipe, - .print_max = 1, + .print_max = true, .print_header = wakeup_print_header, .print_line = wakeup_print_line, .flags = &tracer_flags, @@ -619,7 +620,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = #endif .open = wakeup_trace_open, .close = wakeup_trace_close, - .use_max_tr = 1, + .use_max_tr = true, }; __init static int init_wakeup_tracer(void) @@ -636,4 +637,4 @@ __init static int init_wakeup_tracer(void) return 0; } -device_initcall(init_wakeup_tracer); +core_initcall(init_wakeup_tracer); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 288541f977fb..47623169a815 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -103,54 +103,67 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret) static int trace_selftest_test_probe1_cnt; static void trace_selftest_test_probe1_func(unsigned long ip, - unsigned long pip) + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) { trace_selftest_test_probe1_cnt++; } static int trace_selftest_test_probe2_cnt; static void trace_selftest_test_probe2_func(unsigned long ip, - unsigned long pip) + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) { trace_selftest_test_probe2_cnt++; } static int trace_selftest_test_probe3_cnt; static void trace_selftest_test_probe3_func(unsigned long ip, - unsigned long pip) + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) { trace_selftest_test_probe3_cnt++; } static int trace_selftest_test_global_cnt; static void trace_selftest_test_global_func(unsigned long ip, - unsigned long pip) + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) { trace_selftest_test_global_cnt++; } static int trace_selftest_test_dyn_cnt; static void trace_selftest_test_dyn_func(unsigned long ip, - unsigned long pip) + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) { trace_selftest_test_dyn_cnt++; } static struct ftrace_ops test_probe1 = { .func = trace_selftest_test_probe1_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static struct ftrace_ops test_probe2 = { .func = trace_selftest_test_probe2_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static struct ftrace_ops test_probe3 = { .func = trace_selftest_test_probe3_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static struct ftrace_ops test_global = { - .func = trace_selftest_test_global_func, - .flags = FTRACE_OPS_FL_GLOBAL, + .func = trace_selftest_test_global_func, + .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, }; static void print_counts(void) @@ -307,7 +320,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, int (*func)(void)) { int save_ftrace_enabled = ftrace_enabled; - int save_tracer_enabled = tracer_enabled; unsigned long count; char *func_name; int ret; @@ -318,7 +330,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, /* enable tracing, and record the filter function */ ftrace_enabled = 1; - tracer_enabled = 1; /* passed in by parameter to fool gcc from optimizing */ func(); @@ -382,7 +393,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, out: ftrace_enabled = save_ftrace_enabled; - tracer_enabled = save_tracer_enabled; /* Enable tracing on all functions again */ ftrace_set_global_filter(NULL, 0, 1); @@ -393,10 +403,247 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, return ret; } + +static int trace_selftest_recursion_cnt; +static void trace_selftest_test_recursion_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) +{ + /* + * This function is registered without the recursion safe flag. + * The ftrace infrastructure should provide the recursion + * protection. If not, this will crash the kernel! + */ + trace_selftest_recursion_cnt++; + DYN_FTRACE_TEST_NAME(); +} + +static void trace_selftest_test_recursion_safe_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) +{ + /* + * We said we would provide our own recursion. By calling + * this function again, we should recurse back into this function + * and count again. But this only happens if the arch supports + * all of ftrace features and nothing else is using the function + * tracing utility. + */ + if (trace_selftest_recursion_cnt++) + return; + DYN_FTRACE_TEST_NAME(); +} + +static struct ftrace_ops test_rec_probe = { + .func = trace_selftest_test_recursion_func, +}; + +static struct ftrace_ops test_recsafe_probe = { + .func = trace_selftest_test_recursion_safe_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, +}; + +static int +trace_selftest_function_recursion(void) +{ + int save_ftrace_enabled = ftrace_enabled; + char *func_name; + int len; + int ret; + int cnt; + + /* The previous test PASSED */ + pr_cont("PASSED\n"); + pr_info("Testing ftrace recursion: "); + + + /* enable tracing, and record the filter function */ + ftrace_enabled = 1; + + /* Handle PPC64 '.' name */ + func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + len = strlen(func_name); + + ret = ftrace_set_filter(&test_rec_probe, func_name, len, 1); + if (ret) { + pr_cont("*Could not set filter* "); + goto out; + } + + ret = register_ftrace_function(&test_rec_probe); + if (ret) { + pr_cont("*could not register callback* "); + goto out; + } + + DYN_FTRACE_TEST_NAME(); + + unregister_ftrace_function(&test_rec_probe); + + ret = -1; + if (trace_selftest_recursion_cnt != 1) { + pr_cont("*callback not called once (%d)* ", + trace_selftest_recursion_cnt); + goto out; + } + + trace_selftest_recursion_cnt = 1; + + pr_cont("PASSED\n"); + pr_info("Testing ftrace recursion safe: "); + + ret = ftrace_set_filter(&test_recsafe_probe, func_name, len, 1); + if (ret) { + pr_cont("*Could not set filter* "); + goto out; + } + + ret = register_ftrace_function(&test_recsafe_probe); + if (ret) { + pr_cont("*could not register callback* "); + goto out; + } + + DYN_FTRACE_TEST_NAME(); + + unregister_ftrace_function(&test_recsafe_probe); + + /* + * If arch supports all ftrace features, and no other task + * was on the list, we should be fine. + */ + if (!ftrace_nr_registered_ops() && !FTRACE_FORCE_LIST_FUNC) + cnt = 2; /* Should have recursed */ + else + cnt = 1; + + ret = -1; + if (trace_selftest_recursion_cnt != cnt) { + pr_cont("*callback not called expected %d times (%d)* ", + cnt, trace_selftest_recursion_cnt); + goto out; + } + + ret = 0; +out: + ftrace_enabled = save_ftrace_enabled; + + return ret; +} #else # define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) +# define trace_selftest_function_recursion() ({ 0; }) #endif /* CONFIG_DYNAMIC_FTRACE */ +static enum { + TRACE_SELFTEST_REGS_START, + TRACE_SELFTEST_REGS_FOUND, + TRACE_SELFTEST_REGS_NOT_FOUND, +} trace_selftest_regs_stat; + +static void trace_selftest_test_regs_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) +{ + if (pt_regs) + trace_selftest_regs_stat = TRACE_SELFTEST_REGS_FOUND; + else + trace_selftest_regs_stat = TRACE_SELFTEST_REGS_NOT_FOUND; +} + +static struct ftrace_ops test_regs_probe = { + .func = trace_selftest_test_regs_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_SAVE_REGS, +}; + +static int +trace_selftest_function_regs(void) +{ + int save_ftrace_enabled = ftrace_enabled; + char *func_name; + int len; + int ret; + int supported = 0; + +#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS + supported = 1; +#endif + + /* The previous test PASSED */ + pr_cont("PASSED\n"); + pr_info("Testing ftrace regs%s: ", + !supported ? "(no arch support)" : ""); + + /* enable tracing, and record the filter function */ + ftrace_enabled = 1; + + /* Handle PPC64 '.' name */ + func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + len = strlen(func_name); + + ret = ftrace_set_filter(&test_regs_probe, func_name, len, 1); + /* + * If DYNAMIC_FTRACE is not set, then we just trace all functions. + * This test really doesn't care. + */ + if (ret && ret != -ENODEV) { + pr_cont("*Could not set filter* "); + goto out; + } + + ret = register_ftrace_function(&test_regs_probe); + /* + * Now if the arch does not support passing regs, then this should + * have failed. + */ + if (!supported) { + if (!ret) { + pr_cont("*registered save-regs without arch support* "); + goto out; + } + test_regs_probe.flags |= FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED; + ret = register_ftrace_function(&test_regs_probe); + } + if (ret) { + pr_cont("*could not register callback* "); + goto out; + } + + + DYN_FTRACE_TEST_NAME(); + + unregister_ftrace_function(&test_regs_probe); + + ret = -1; + + switch (trace_selftest_regs_stat) { + case TRACE_SELFTEST_REGS_START: + pr_cont("*callback never called* "); + goto out; + + case TRACE_SELFTEST_REGS_FOUND: + if (supported) + break; + pr_cont("*callback received regs without arch support* "); + goto out; + + case TRACE_SELFTEST_REGS_NOT_FOUND: + if (!supported) + break; + pr_cont("*callback received NULL regs* "); + goto out; + } + + ret = 0; +out: + ftrace_enabled = save_ftrace_enabled; + + return ret; +} + /* * Simple verification test of ftrace function tracer. * Enable ftrace, sleep 1/10 second, and then read the trace @@ -406,7 +653,6 @@ int trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) { int save_ftrace_enabled = ftrace_enabled; - int save_tracer_enabled = tracer_enabled; unsigned long count; int ret; @@ -415,7 +661,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) /* start the tracing */ ftrace_enabled = 1; - tracer_enabled = 1; ret = tracer_init(trace, tr); if (ret) { @@ -442,10 +687,16 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) ret = trace_selftest_startup_dynamic_tracing(trace, tr, DYN_FTRACE_TEST_NAME); + if (ret) + goto out; + + ret = trace_selftest_function_recursion(); + if (ret) + goto out; + ret = trace_selftest_function_regs(); out: ftrace_enabled = save_ftrace_enabled; - tracer_enabled = save_tracer_enabled; /* kill ftrace totally if we failed */ if (ret) @@ -778,6 +1029,8 @@ static int trace_wakeup_test_thread(void *data) set_current_state(TASK_INTERRUPTIBLE); schedule(); + complete(x); + /* we are awake, now wait to disappear */ while (!kthread_should_stop()) { /* @@ -821,29 +1074,27 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) /* reset the max latency */ tracing_max_latency = 0; - /* sleep to let the RT thread sleep too */ - msleep(100); + while (p->on_rq) { + /* + * Sleep to make sure the RT thread is asleep too. + * On virtual machines we can't rely on timings, + * but we want to make sure this test still works. + */ + msleep(100); + } - /* - * Yes this is slightly racy. It is possible that for some - * strange reason that the RT thread we created, did not - * call schedule for 100ms after doing the completion, - * and we do a wakeup on a task that already is awake. - * But that is extremely unlikely, and the worst thing that - * happens in such a case, is that we disable tracing. - * Honestly, if this race does happen something is horrible - * wrong with the system. - */ + init_completion(&isrt); wake_up_process(p); - /* give a little time to let the thread wake up */ - msleep(100); + /* Wait for the task to wake up */ + wait_for_completion(&isrt); /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(tr, NULL); + printk("ret = %d\n", ret); if (!ret) ret = trace_test_buffer(&max_tr, &count); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index d4545f49242e..42ca822fc701 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -33,7 +33,6 @@ static unsigned long max_stack_size; static arch_spinlock_t max_stack_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; -static int stack_trace_disabled __read_mostly; static DEFINE_PER_CPU(int, trace_active); static DEFINE_MUTEX(stack_sysctl_mutex); @@ -111,13 +110,11 @@ static inline void check_stack(void) } static void -stack_trace_call(unsigned long ip, unsigned long parent_ip) +stack_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) { int cpu; - if (unlikely(!ftrace_enabled || stack_trace_disabled)) - return; - preempt_disable_notrace(); cpu = raw_smp_processor_id(); @@ -136,6 +133,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __read_mostly = { .func = stack_trace_call, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static ssize_t diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 6b245f64c8dd..7609dd6714c2 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -21,9 +21,6 @@ static int syscall_enter_register(struct ftrace_event_call *event, static int syscall_exit_register(struct ftrace_event_call *event, enum trace_reg type, void *data); -static int syscall_enter_define_fields(struct ftrace_event_call *call); -static int syscall_exit_define_fields(struct ftrace_event_call *call); - static struct list_head * syscall_get_enter_fields(struct ftrace_event_call *call) { @@ -32,30 +29,6 @@ syscall_get_enter_fields(struct ftrace_event_call *call) return &entry->enter_fields; } -struct trace_event_functions enter_syscall_print_funcs = { - .trace = print_syscall_enter, -}; - -struct trace_event_functions exit_syscall_print_funcs = { - .trace = print_syscall_exit, -}; - -struct ftrace_event_class event_class_syscall_enter = { - .system = "syscalls", - .reg = syscall_enter_register, - .define_fields = syscall_enter_define_fields, - .get_fields = syscall_get_enter_fields, - .raw_init = init_syscall_trace, -}; - -struct ftrace_event_class event_class_syscall_exit = { - .system = "syscalls", - .reg = syscall_exit_register, - .define_fields = syscall_exit_define_fields, - .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), - .raw_init = init_syscall_trace, -}; - extern struct syscall_metadata *__start_syscalls_metadata[]; extern struct syscall_metadata *__stop_syscalls_metadata[]; @@ -432,7 +405,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call) mutex_unlock(&syscall_trace_lock); } -int init_syscall_trace(struct ftrace_event_call *call) +static int init_syscall_trace(struct ftrace_event_call *call) { int id; int num; @@ -457,6 +430,30 @@ int init_syscall_trace(struct ftrace_event_call *call) return id; } +struct trace_event_functions enter_syscall_print_funcs = { + .trace = print_syscall_enter, +}; + +struct trace_event_functions exit_syscall_print_funcs = { + .trace = print_syscall_exit, +}; + +struct ftrace_event_class event_class_syscall_enter = { + .system = "syscalls", + .reg = syscall_enter_register, + .define_fields = syscall_enter_define_fields, + .get_fields = syscall_get_enter_fields, + .raw_init = init_syscall_trace, +}; + +struct ftrace_event_class event_class_syscall_exit = { + .system = "syscalls", + .reg = syscall_exit_register, + .define_fields = syscall_exit_define_fields, + .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), + .raw_init = init_syscall_trace, +}; + unsigned long __init __weak arch_syscall_addr(int nr) { return (unsigned long)sys_call_table[nr]; @@ -487,7 +484,7 @@ int __init init_ftrace_syscalls(void) return 0; } -core_initcall(init_ftrace_syscalls); +early_initcall(init_ftrace_syscalls); #ifdef CONFIG_PERF_EVENTS @@ -537,7 +534,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); } -int perf_sysenter_enable(struct ftrace_event_call *call) +static int perf_sysenter_enable(struct ftrace_event_call *call) { int ret = 0; int num; @@ -558,7 +555,7 @@ int perf_sysenter_enable(struct ftrace_event_call *call) return ret; } -void perf_sysenter_disable(struct ftrace_event_call *call) +static void perf_sysenter_disable(struct ftrace_event_call *call) { int num; @@ -615,7 +612,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); } -int perf_sysexit_enable(struct ftrace_event_call *call) +static int perf_sysexit_enable(struct ftrace_event_call *call) { int ret = 0; int num; @@ -636,7 +633,7 @@ int perf_sysexit_enable(struct ftrace_event_call *call) return ret; } -void perf_sysexit_disable(struct ftrace_event_call *call) +static void perf_sysexit_disable(struct ftrace_event_call *call) { int num; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 03003cd7dd96..c86e6d4f67fb 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -22,6 +22,7 @@ #include <linux/uaccess.h> #include <linux/uprobes.h> #include <linux/namei.h> +#include <linux/string.h> #include "trace_probe.h" @@ -189,7 +190,7 @@ static int create_trace_uprobe(int argc, char **argv) if (argv[0][0] == '-') is_delete = true; else if (argv[0][0] != 'p') { - pr_info("Probe definition must be started with 'p', 'r' or" " '-'.\n"); + pr_info("Probe definition must be started with 'p' or '-'.\n"); return -EINVAL; } @@ -252,7 +253,7 @@ static int create_trace_uprobe(int argc, char **argv) if (ret) goto fail_address_parse; - ret = strict_strtoul(arg, 0, &offset); + ret = kstrtoul(arg, 0, &offset); if (ret) goto fail_address_parse; @@ -263,16 +264,15 @@ static int create_trace_uprobe(int argc, char **argv) /* setup a probe */ if (!event) { - char *tail = strrchr(filename, '/'); + char *tail; char *ptr; - ptr = kstrdup((tail ? tail + 1 : filename), GFP_KERNEL); - if (!ptr) { + tail = kstrdup(kbasename(filename), GFP_KERNEL); + if (!tail) { ret = -ENOMEM; goto fail_address_parse; } - tail = ptr; ptr = strpbrk(tail, ".-_"); if (ptr) *ptr = '\0'; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 23b4d784ebdd..625df0b44690 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -26,7 +26,9 @@ /* * fill in basic accounting fields */ -void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) +void bacct_add_tsk(struct user_namespace *user_ns, + struct pid_namespace *pid_ns, + struct taskstats *stats, struct task_struct *tsk) { const struct cred *tcred; struct timespec uptime, ts; @@ -55,13 +57,13 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) stats->ac_flag |= AXSIG; stats->ac_nice = task_nice(tsk); stats->ac_sched = tsk->policy; - stats->ac_pid = tsk->pid; + stats->ac_pid = task_pid_nr_ns(tsk, pid_ns); rcu_read_lock(); tcred = __task_cred(tsk); - stats->ac_uid = tcred->uid; - stats->ac_gid = tcred->gid; + stats->ac_uid = from_kuid_munged(user_ns, tcred->uid); + stats->ac_gid = from_kgid_munged(user_ns, tcred->gid); stats->ac_ppid = pid_alive(tsk) ? - rcu_dereference(tsk->real_parent)->tgid : 0; + task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0; rcu_read_unlock(); stats->ac_utime = cputime_to_usecs(tsk->utime); stats->ac_stime = cputime_to_usecs(tsk->stime); diff --git a/kernel/user.c b/kernel/user.c index b815fefbe76f..33acb5e53a5f 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -16,6 +16,7 @@ #include <linux/interrupt.h> #include <linux/export.h> #include <linux/user_namespace.h> +#include <linux/proc_fs.h> /* * userns count is 1 for root user, 1 for init_uts_ns, @@ -38,11 +39,20 @@ struct user_namespace init_user_ns = { .count = 4294967295U, }, }, + .projid_map = { + .nr_extents = 1, + .extent[0] = { + .first = 0, + .lower_first = 0, + .count = 4294967295U, + }, + }, .kref = { .refcount = ATOMIC_INIT(3), }, .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, + .proc_inum = PROC_USER_INIT_INO, }; EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 86602316422d..2b042c42fbc4 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -9,6 +9,7 @@ #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/user_namespace.h> +#include <linux/proc_fs.h> #include <linux/highuid.h> #include <linux/cred.h> #include <linux/securebits.h> @@ -19,12 +20,31 @@ #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/ctype.h> +#include <linux/projid.h> static struct kmem_cache *user_ns_cachep __read_mostly; static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); +static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) +{ + /* Start with the same capabilities as init but useless for doing + * anything as the capabilities are bound to the new user namespace. + */ + cred->securebits = SECUREBITS_DEFAULT; + cred->cap_inheritable = CAP_EMPTY_SET; + cred->cap_permitted = CAP_FULL_SET; + cred->cap_effective = CAP_FULL_SET; + cred->cap_bset = CAP_FULL_SET; +#ifdef CONFIG_KEYS + key_put(cred->request_key_auth); + cred->request_key_auth = NULL; +#endif + /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ + cred->user_ns = user_ns; +} + /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the @@ -38,6 +58,7 @@ int create_user_ns(struct cred *new) struct user_namespace *ns, *parent_ns = new->user_ns; kuid_t owner = new->euid; kgid_t group = new->egid; + int ret; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who @@ -51,38 +72,45 @@ int create_user_ns(struct cred *new) if (!ns) return -ENOMEM; + ret = proc_alloc_inum(&ns->proc_inum); + if (ret) { + kmem_cache_free(user_ns_cachep, ns); + return ret; + } + kref_init(&ns->kref); + /* Leave the new->user_ns reference with the new user namespace. */ ns->parent = parent_ns; ns->owner = owner; ns->group = group; - /* Start with the same capabilities as init but useless for doing - * anything as the capabilities are bound to the new user namespace. - */ - new->securebits = SECUREBITS_DEFAULT; - new->cap_inheritable = CAP_EMPTY_SET; - new->cap_permitted = CAP_FULL_SET; - new->cap_effective = CAP_FULL_SET; - new->cap_bset = CAP_FULL_SET; -#ifdef CONFIG_KEYS - key_put(new->request_key_auth); - new->request_key_auth = NULL; -#endif - /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ - - /* Leave the new->user_ns reference with the new user namespace. */ - /* Leave the reference to our user_ns with the new cred. */ - new->user_ns = ns; + set_cred_user_ns(new, ns); return 0; } +int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) +{ + struct cred *cred; + + if (!(unshare_flags & CLONE_NEWUSER)) + return 0; + + cred = prepare_creds(); + if (!cred) + return -ENOMEM; + + *new_cred = cred; + return create_user_ns(cred); +} + void free_user_ns(struct kref *kref) { struct user_namespace *parent, *ns = container_of(kref, struct user_namespace, kref); parent = ns->parent; + proc_free_inum(ns->proc_inum); kmem_cache_free(user_ns_cachep, ns); put_user_ns(parent); } @@ -295,6 +323,75 @@ gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) } EXPORT_SYMBOL(from_kgid_munged); +/** + * make_kprojid - Map a user-namespace projid pair into a kprojid. + * @ns: User namespace that the projid is in + * @projid: Project identifier + * + * Maps a user-namespace uid pair into a kernel internal kuid, + * and returns that kuid. + * + * When there is no mapping defined for the user-namespace projid + * pair INVALID_PROJID is returned. Callers are expected to test + * for and handle handle INVALID_PROJID being returned. INVALID_PROJID + * may be tested for using projid_valid(). + */ +kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) +{ + /* Map the uid to a global kernel uid */ + return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid)); +} +EXPORT_SYMBOL(make_kprojid); + +/** + * from_kprojid - Create a projid from a kprojid user-namespace pair. + * @targ: The user namespace we want a projid in. + * @kprojid: The kernel internal project identifier to start with. + * + * Map @kprojid into the user-namespace specified by @targ and + * return the resulting projid. + * + * There is always a mapping into the initial user_namespace. + * + * If @kprojid has no mapping in @targ (projid_t)-1 is returned. + */ +projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid) +{ + /* Map the uid from a global kernel uid */ + return map_id_up(&targ->projid_map, __kprojid_val(kprojid)); +} +EXPORT_SYMBOL(from_kprojid); + +/** + * from_kprojid_munged - Create a projiid from a kprojid user-namespace pair. + * @targ: The user namespace we want a projid in. + * @kprojid: The kernel internal projid to start with. + * + * Map @kprojid into the user-namespace specified by @targ and + * return the resulting projid. + * + * There is always a mapping into the initial user_namespace. + * + * Unlike from_kprojid from_kprojid_munged never fails and always + * returns a valid projid. This makes from_kprojid_munged + * appropriate for use in syscalls like stat and where + * failing the system call and failing to provide a valid projid are + * not an options. + * + * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned. + */ +projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid) +{ + projid_t projid; + projid = from_kprojid(targ, kprojid); + + if (projid == (projid_t) -1) + projid = OVERFLOW_PROJID; + return projid; +} +EXPORT_SYMBOL(from_kprojid_munged); + + static int uid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; @@ -302,7 +399,7 @@ static int uid_m_show(struct seq_file *seq, void *v) struct user_namespace *lower_ns; uid_t lower; - lower_ns = current_user_ns(); + lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; @@ -323,7 +420,7 @@ static int gid_m_show(struct seq_file *seq, void *v) struct user_namespace *lower_ns; gid_t lower; - lower_ns = current_user_ns(); + lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; @@ -337,6 +434,27 @@ static int gid_m_show(struct seq_file *seq, void *v) return 0; } +static int projid_m_show(struct seq_file *seq, void *v) +{ + struct user_namespace *ns = seq->private; + struct uid_gid_extent *extent = v; + struct user_namespace *lower_ns; + projid_t lower; + + lower_ns = seq_user_ns(seq); + if ((lower_ns == ns) && lower_ns->parent) + lower_ns = lower_ns->parent; + + lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first)); + + seq_printf(seq, "%10u %10u %10u\n", + extent->first, + lower, + extent->count); + + return 0; +} + static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) { struct uid_gid_extent *extent = NULL; @@ -362,6 +480,13 @@ static void *gid_m_start(struct seq_file *seq, loff_t *ppos) return m_start(seq, ppos, &ns->gid_map); } +static void *projid_m_start(struct seq_file *seq, loff_t *ppos) +{ + struct user_namespace *ns = seq->private; + + return m_start(seq, ppos, &ns->projid_map); +} + static void *m_next(struct seq_file *seq, void *v, loff_t *pos) { (*pos)++; @@ -387,6 +512,13 @@ struct seq_operations proc_gid_seq_operations = { .show = gid_m_show, }; +struct seq_operations proc_projid_seq_operations = { + .start = projid_m_start, + .stop = m_stop, + .next = m_next, + .show = projid_m_show, +}; + static DEFINE_MUTEX(id_map_mutex); static ssize_t map_write(struct file *file, const char __user *buf, @@ -434,7 +566,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, /* Require the appropriate privilege CAP_SETUID or CAP_SETGID * over the user namespace in order to set the id mapping. */ - if (!ns_capable(ns, cap_setid)) + if (cap_valid(cap_setid) && !ns_capable(ns, cap_setid)) goto out; /* Get a buffer */ @@ -564,10 +696,14 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; + struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; + if ((seq_ns != ns) && (seq_ns != ns->parent)) + return -EPERM; + return map_write(file, buf, size, ppos, CAP_SETUID, &ns->uid_map, &ns->parent->uid_map); } @@ -576,17 +712,57 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; + struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; + if ((seq_ns != ns) && (seq_ns != ns->parent)) + return -EPERM; + return map_write(file, buf, size, ppos, CAP_SETGID, &ns->gid_map, &ns->parent->gid_map); } +ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) +{ + struct seq_file *seq = file->private_data; + struct user_namespace *ns = seq->private; + struct user_namespace *seq_ns = seq_user_ns(seq); + + if (!ns->parent) + return -EPERM; + + if ((seq_ns != ns) && (seq_ns != ns->parent)) + return -EPERM; + + /* Anyone can set any valid project id no capability needed */ + return map_write(file, buf, size, ppos, -1, + &ns->projid_map, &ns->parent->projid_map); +} + static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { + /* Allow mapping to your own filesystem ids */ + if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) { + u32 id = new_map->extent[0].lower_first; + if (cap_setid == CAP_SETUID) { + kuid_t uid = make_kuid(ns->parent, id); + if (uid_eq(uid, current_fsuid())) + return true; + } + else if (cap_setid == CAP_SETGID) { + kgid_t gid = make_kgid(ns->parent, id); + if (gid_eq(gid, current_fsgid())) + return true; + } + } + + /* Allow anyone to set a mapping that doesn't require privilege */ + if (!cap_valid(cap_setid)) + return true; + /* Allow the specified ids if we have the appropriate capability * (CAP_SETUID or CAP_SETGID) over the parent user namespace. */ @@ -596,6 +772,65 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, return false; } +static void *userns_get(struct task_struct *task) +{ + struct user_namespace *user_ns; + + rcu_read_lock(); + user_ns = get_user_ns(__task_cred(task)->user_ns); + rcu_read_unlock(); + + return user_ns; +} + +static void userns_put(void *ns) +{ + put_user_ns(ns); +} + +static int userns_install(struct nsproxy *nsproxy, void *ns) +{ + struct user_namespace *user_ns = ns; + struct cred *cred; + + /* Don't allow gaining capabilities by reentering + * the same user namespace. + */ + if (user_ns == current_user_ns()) + return -EINVAL; + + /* Threaded processes may not enter a different user namespace */ + if (atomic_read(¤t->mm->mm_users) > 1) + return -EINVAL; + + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + cred = prepare_creds(); + if (!cred) + return -ENOMEM; + + put_user_ns(cred->user_ns); + set_cred_user_ns(cred, get_user_ns(user_ns)); + + return commit_creds(cred); +} + +static unsigned int userns_inum(void *ns) +{ + struct user_namespace *user_ns = ns; + return user_ns->proc_inum; +} + +const struct proc_ns_operations userns_operations = { + .name = "user", + .type = CLONE_NEWUSER, + .get = userns_get, + .put = userns_put, + .install = userns_install, + .inum = userns_inum, +}; + static __init int user_namespaces_init(void) { user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); diff --git a/kernel/utsname.c b/kernel/utsname.c index 679d97a5d3fd..08b197e8c485 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -32,18 +32,25 @@ static struct uts_namespace *create_uts_ns(void) * @old_ns: namespace to clone * Return NULL on error (failure to kmalloc), new ns otherwise */ -static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, +static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, struct uts_namespace *old_ns) { struct uts_namespace *ns; + int err; ns = create_uts_ns(); if (!ns) return ERR_PTR(-ENOMEM); + err = proc_alloc_inum(&ns->proc_inum); + if (err) { + kfree(ns); + return ERR_PTR(err); + } + down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); - ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); + ns->user_ns = get_user_ns(user_ns); up_read(&uts_sem); return ns; } @@ -55,9 +62,8 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, * versa. */ struct uts_namespace *copy_utsname(unsigned long flags, - struct task_struct *tsk) + struct user_namespace *user_ns, struct uts_namespace *old_ns) { - struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; struct uts_namespace *new_ns; BUG_ON(!old_ns); @@ -66,7 +72,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, if (!(flags & CLONE_NEWUTS)) return old_ns; - new_ns = clone_uts_ns(tsk, old_ns); + new_ns = clone_uts_ns(user_ns, old_ns); put_uts_ns(old_ns); return new_ns; @@ -78,6 +84,7 @@ void free_uts_ns(struct kref *kref) ns = container_of(kref, struct uts_namespace, kref); put_user_ns(ns->user_ns); + proc_free_inum(ns->proc_inum); kfree(ns); } @@ -102,19 +109,32 @@ static void utsns_put(void *ns) put_uts_ns(ns); } -static int utsns_install(struct nsproxy *nsproxy, void *ns) +static int utsns_install(struct nsproxy *nsproxy, void *new) { + struct uts_namespace *ns = new; + + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || + !nsown_capable(CAP_SYS_ADMIN)) + return -EPERM; + get_uts_ns(ns); put_uts_ns(nsproxy->uts_ns); nsproxy->uts_ns = ns; return 0; } +static unsigned int utsns_inum(void *vp) +{ + struct uts_namespace *ns = vp; + + return ns->proc_inum; +} + const struct proc_ns_operations utsns_operations = { .name = "uts", .type = CLONE_NEWUTS, .get = utsns_get, .put = utsns_put, .install = utsns_install, + .inum = utsns_inum, }; - diff --git a/kernel/wait.c b/kernel/wait.c index 7fdd9eaca2c3..6698e0c04ead 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -1,7 +1,7 @@ /* * Generic waiting primitives. * - * (C) 2004 William Irwin, Oracle + * (C) 2004 Nadia Yvette Chambers, Oracle */ #include <linux/init.h> #include <linux/export.h> diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 4b1dfba70f7c..75a2ab3d0b02 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -22,6 +22,7 @@ #include <linux/notifier.h> #include <linux/module.h> #include <linux/sysctl.h> +#include <linux/smpboot.h> #include <asm/irq_regs.h> #include <linux/kvm_para.h> @@ -29,16 +30,19 @@ int watchdog_enabled = 1; int __read_mostly watchdog_thresh = 10; +static int __read_mostly watchdog_disabled; +static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); static DEFINE_PER_CPU(bool, softlockup_touch_sync); static DEFINE_PER_CPU(bool, soft_watchdog_warn); +static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); +static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); #ifdef CONFIG_HARDLOCKUP_DETECTOR static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); -static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); #endif @@ -113,7 +117,7 @@ static unsigned long get_timestamp(int this_cpu) return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ } -static unsigned long get_sample_period(void) +static void set_sample_period(void) { /* * convert watchdog_thresh from seconds to ns @@ -122,7 +126,7 @@ static unsigned long get_sample_period(void) * and hard thresholds) to increment before the * hardlockup detector generates a warning */ - return get_softlockup_thresh() * (NSEC_PER_SEC / 5); + sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); } /* Commands for resetting the watchdog */ @@ -248,13 +252,15 @@ static void watchdog_overflow_callback(struct perf_event *event, __this_cpu_write(hard_watchdog_warn, false); return; } +#endif /* CONFIG_HARDLOCKUP_DETECTOR */ + static void watchdog_interrupt_count(void) { __this_cpu_inc(hrtimer_interrupts); } -#else -static inline void watchdog_interrupt_count(void) { return; } -#endif /* CONFIG_HARDLOCKUP_DETECTOR */ + +static int watchdog_nmi_enable(unsigned int cpu); +static void watchdog_nmi_disable(unsigned int cpu); /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) @@ -270,7 +276,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) wake_up_process(__this_cpu_read(softlockup_watchdog)); /* .. and repeat */ - hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); + hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); if (touch_ts == 0) { if (unlikely(__this_cpu_read(softlockup_touch_sync))) { @@ -327,49 +333,68 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) return HRTIMER_RESTART; } +static void watchdog_set_prio(unsigned int policy, unsigned int prio) +{ + struct sched_param param = { .sched_priority = prio }; -/* - * The watchdog thread - touches the timestamp. - */ -static int watchdog(void *unused) + sched_setscheduler(current, policy, ¶m); +} + +static void watchdog_enable(unsigned int cpu) { - struct sched_param param = { .sched_priority = 0 }; struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); - /* initialize timestamp */ - __touch_watchdog(); - /* kick off the timer for the hardlockup detector */ + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer->function = watchdog_timer_fn; + + if (!watchdog_enabled) { + kthread_park(current); + return; + } + + /* Enable the perf event */ + watchdog_nmi_enable(cpu); + /* done here because hrtimer_start can only pin to smp_processor_id() */ - hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), + hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL_PINNED); - set_current_state(TASK_INTERRUPTIBLE); - /* - * Run briefly (kicked by the hrtimer callback function) once every - * get_sample_period() seconds (4 seconds by default) to reset the - * softlockup timestamp. If this gets delayed for more than - * 2*watchdog_thresh seconds then the debug-printout triggers in - * watchdog_timer_fn(). - */ - while (!kthread_should_stop()) { - __touch_watchdog(); - schedule(); + /* initialize timestamp */ + watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1); + __touch_watchdog(); +} - if (kthread_should_stop()) - break; +static void watchdog_disable(unsigned int cpu) +{ + struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); - set_current_state(TASK_INTERRUPTIBLE); - } - /* - * Drop the policy/priority elevation during thread exit to avoid a - * scheduling latency spike. - */ - __set_current_state(TASK_RUNNING); - sched_setscheduler(current, SCHED_NORMAL, ¶m); - return 0; + watchdog_set_prio(SCHED_NORMAL, 0); + hrtimer_cancel(hrtimer); + /* disable the perf event */ + watchdog_nmi_disable(cpu); +} + +static int watchdog_should_run(unsigned int cpu) +{ + return __this_cpu_read(hrtimer_interrupts) != + __this_cpu_read(soft_lockup_hrtimer_cnt); } +/* + * The watchdog thread function - touches the timestamp. + * + * It only runs once every sample_period seconds (4 seconds by + * default) to reset the softlockup timestamp. If this gets delayed + * for more than 2*watchdog_thresh seconds then the debug-printout + * triggers in watchdog_timer_fn(). + */ +static void watchdog(unsigned int cpu) +{ + __this_cpu_write(soft_lockup_hrtimer_cnt, + __this_cpu_read(hrtimer_interrupts)); + __touch_watchdog(); +} #ifdef CONFIG_HARDLOCKUP_DETECTOR /* @@ -379,7 +404,7 @@ static int watchdog(void *unused) */ static unsigned long cpu0_err; -static int watchdog_nmi_enable(int cpu) +static int watchdog_nmi_enable(unsigned int cpu) { struct perf_event_attr *wd_attr; struct perf_event *event = per_cpu(watchdog_ev, cpu); @@ -433,7 +458,7 @@ out: return 0; } -static void watchdog_nmi_disable(int cpu) +static void watchdog_nmi_disable(unsigned int cpu) { struct perf_event *event = per_cpu(watchdog_ev, cpu); @@ -447,107 +472,35 @@ static void watchdog_nmi_disable(int cpu) return; } #else -static int watchdog_nmi_enable(int cpu) { return 0; } -static void watchdog_nmi_disable(int cpu) { return; } +static int watchdog_nmi_enable(unsigned int cpu) { return 0; } +static void watchdog_nmi_disable(unsigned int cpu) { return; } #endif /* CONFIG_HARDLOCKUP_DETECTOR */ /* prepare/enable/disable routines */ -static void watchdog_prepare_cpu(int cpu) -{ - struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); - - WARN_ON(per_cpu(softlockup_watchdog, cpu)); - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtimer->function = watchdog_timer_fn; -} - -static int watchdog_enable(int cpu) -{ - struct task_struct *p = per_cpu(softlockup_watchdog, cpu); - int err = 0; - - /* enable the perf event */ - err = watchdog_nmi_enable(cpu); - - /* Regardless of err above, fall through and start softlockup */ - - /* create the watchdog thread */ - if (!p) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu); - if (IS_ERR(p)) { - pr_err("softlockup watchdog for %i failed\n", cpu); - if (!err) { - /* if hardlockup hasn't already set this */ - err = PTR_ERR(p); - /* and disable the perf event */ - watchdog_nmi_disable(cpu); - } - goto out; - } - sched_setscheduler(p, SCHED_FIFO, ¶m); - kthread_bind(p, cpu); - per_cpu(watchdog_touch_ts, cpu) = 0; - per_cpu(softlockup_watchdog, cpu) = p; - wake_up_process(p); - } - -out: - return err; -} - -static void watchdog_disable(int cpu) -{ - struct task_struct *p = per_cpu(softlockup_watchdog, cpu); - struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); - - /* - * cancel the timer first to stop incrementing the stats - * and waking up the kthread - */ - hrtimer_cancel(hrtimer); - - /* disable the perf event */ - watchdog_nmi_disable(cpu); - - /* stop the watchdog thread */ - if (p) { - per_cpu(softlockup_watchdog, cpu) = NULL; - kthread_stop(p); - } -} - /* sysctl functions */ #ifdef CONFIG_SYSCTL static void watchdog_enable_all_cpus(void) { - int cpu; - - watchdog_enabled = 0; - - for_each_online_cpu(cpu) - if (!watchdog_enable(cpu)) - /* if any cpu succeeds, watchdog is considered - enabled for the system */ - watchdog_enabled = 1; - - if (!watchdog_enabled) - pr_err("failed to be enabled on some cpus\n"); + unsigned int cpu; + if (watchdog_disabled) { + watchdog_disabled = 0; + for_each_online_cpu(cpu) + kthread_unpark(per_cpu(softlockup_watchdog, cpu)); + } } static void watchdog_disable_all_cpus(void) { - int cpu; - - for_each_online_cpu(cpu) - watchdog_disable(cpu); + unsigned int cpu; - /* if all watchdogs are disabled, then they are disabled for the system */ - watchdog_enabled = 0; + if (!watchdog_disabled) { + watchdog_disabled = 1; + for_each_online_cpu(cpu) + kthread_park(per_cpu(softlockup_watchdog, cpu)); + } } - /* * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh */ @@ -557,73 +510,38 @@ int proc_dowatchdog(struct ctl_table *table, int write, { int ret; + if (watchdog_disabled < 0) + return -ENODEV; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) - goto out; + return ret; + set_sample_period(); if (watchdog_enabled && watchdog_thresh) watchdog_enable_all_cpus(); else watchdog_disable_all_cpus(); -out: return ret; } #endif /* CONFIG_SYSCTL */ - -/* - * Create/destroy watchdog threads as CPUs come and go: - */ -static int __cpuinit -cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int hotcpu = (unsigned long)hcpu; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - watchdog_prepare_cpu(hotcpu); - break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - if (watchdog_enabled) - watchdog_enable(hotcpu); - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - watchdog_disable(hotcpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - watchdog_disable(hotcpu); - break; -#endif /* CONFIG_HOTPLUG_CPU */ - } - - /* - * hardlockup and softlockup are not important enough - * to block cpu bring up. Just always succeed and - * rely on printk output to flag problems. - */ - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata cpu_nfb = { - .notifier_call = cpu_callback +static struct smp_hotplug_thread watchdog_threads = { + .store = &softlockup_watchdog, + .thread_should_run = watchdog_should_run, + .thread_fn = watchdog, + .thread_comm = "watchdog/%u", + .setup = watchdog_enable, + .park = watchdog_disable, + .unpark = watchdog_enable, }; void __init lockup_detector_init(void) { - void *cpu = (void *)(long)smp_processor_id(); - int err; - - err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - WARN_ON(notifier_to_errno(err)); - - cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); - register_cpu_notifier(&cpu_nfb); - - return; + set_sample_period(); + if (smpboot_register_percpu_thread(&watchdog_threads)) { + pr_err("Failed to create watchdog threads, disabled\n"); + watchdog_disabled = -ENODEV; + } } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3c5a79e2134c..fbc6576a83c3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -58,7 +58,7 @@ enum { * be executing on any CPU. The gcwq behaves as an unbound one. * * Note that DISASSOCIATED can be flipped only while holding - * managership of all pools on the gcwq to avoid changing binding + * assoc_mutex of all pools on the gcwq to avoid changing binding * state while create_worker() is in progress. */ GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */ @@ -73,11 +73,10 @@ enum { WORKER_DIE = 1 << 1, /* die die die */ WORKER_IDLE = 1 << 2, /* is idle */ WORKER_PREP = 1 << 3, /* preparing to run works */ - WORKER_REBIND = 1 << 5, /* mom is home, come back */ WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ WORKER_UNBOUND = 1 << 7, /* worker is unbound */ - WORKER_NOT_RUNNING = WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND | + WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND | WORKER_CPU_INTENSIVE, NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ @@ -126,7 +125,6 @@ enum { struct global_cwq; struct worker_pool; -struct idle_rebind; /* * The poor guys doing the actual heavy lifting. All on-duty workers @@ -150,7 +148,6 @@ struct worker { int id; /* I: worker id */ /* for rebinding worker to CPU */ - struct idle_rebind *idle_rebind; /* L: for idle worker */ struct work_struct rebind_work; /* L: for busy worker */ }; @@ -160,13 +157,15 @@ struct worker_pool { struct list_head worklist; /* L: list of pending works */ int nr_workers; /* L: total number of workers */ + + /* nr_idle includes the ones off idle_list for rebinding */ int nr_idle; /* L: currently idle ones */ struct list_head idle_list; /* X: list of idle workers */ struct timer_list idle_timer; /* L: worker idle timeout */ struct timer_list mayday_timer; /* L: SOS timer for workers */ - struct mutex manager_mutex; /* mutex manager should hold */ + struct mutex assoc_mutex; /* protect GCWQ_DISASSOCIATED */ struct ida worker_ida; /* L: for worker IDs */ }; @@ -184,9 +183,8 @@ struct global_cwq { struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; /* L: hash of busy workers */ - struct worker_pool pools[2]; /* normal and highpri pools */ - - wait_queue_head_t rebind_hold; /* rebind hold wait */ + struct worker_pool pools[NR_WORKER_POOLS]; + /* normal and highpri pools */ } ____cacheline_aligned_in_smp; /* @@ -269,17 +267,15 @@ struct workqueue_struct { }; struct workqueue_struct *system_wq __read_mostly; -struct workqueue_struct *system_long_wq __read_mostly; -struct workqueue_struct *system_nrt_wq __read_mostly; -struct workqueue_struct *system_unbound_wq __read_mostly; -struct workqueue_struct *system_freezable_wq __read_mostly; -struct workqueue_struct *system_nrt_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_wq); +struct workqueue_struct *system_highpri_wq __read_mostly; +EXPORT_SYMBOL_GPL(system_highpri_wq); +struct workqueue_struct *system_long_wq __read_mostly; EXPORT_SYMBOL_GPL(system_long_wq); -EXPORT_SYMBOL_GPL(system_nrt_wq); +struct workqueue_struct *system_unbound_wq __read_mostly; EXPORT_SYMBOL_GPL(system_unbound_wq); +struct workqueue_struct *system_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_freezable_wq); -EXPORT_SYMBOL_GPL(system_nrt_freezable_wq); #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> @@ -534,18 +530,24 @@ static int work_next_color(int color) } /* - * A work's data points to the cwq with WORK_STRUCT_CWQ set while the - * work is on queue. Once execution starts, WORK_STRUCT_CWQ is - * cleared and the work data contains the cpu number it was last on. + * While queued, %WORK_STRUCT_CWQ is set and non flag bits of a work's data + * contain the pointer to the queued cwq. Once execution starts, the flag + * is cleared and the high bits contain OFFQ flags and CPU number. * - * set_work_{cwq|cpu}() and clear_work_data() can be used to set the - * cwq, cpu or clear work->data. These functions should only be - * called while the work is owned - ie. while the PENDING bit is set. + * set_work_cwq(), set_work_cpu_and_clear_pending(), mark_work_canceling() + * and clear_work_data() can be used to set the cwq, cpu or clear + * work->data. These functions should only be called while the work is + * owned - ie. while the PENDING bit is set. * - * get_work_[g]cwq() can be used to obtain the gcwq or cwq - * corresponding to a work. gcwq is available once the work has been - * queued anywhere after initialization. cwq is available only from - * queueing until execution starts. + * get_work_[g]cwq() can be used to obtain the gcwq or cwq corresponding to + * a work. gcwq is available once the work has been queued anywhere after + * initialization until it is sync canceled. cwq is available only while + * the work item is queued. + * + * %WORK_OFFQ_CANCELING is used to mark a work item which is being + * canceled. While being canceled, a work item may have its PENDING set + * but stay off timer and worklist for arbitrarily long and nobody should + * try to steal the PENDING bit. */ static inline void set_work_data(struct work_struct *work, unsigned long data, unsigned long flags) @@ -562,13 +564,22 @@ static void set_work_cwq(struct work_struct *work, WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); } -static void set_work_cpu(struct work_struct *work, unsigned int cpu) +static void set_work_cpu_and_clear_pending(struct work_struct *work, + unsigned int cpu) { - set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING); + /* + * The following wmb is paired with the implied mb in + * test_and_set_bit(PENDING) and ensures all updates to @work made + * here are visible to and precede any updates by the next PENDING + * owner. + */ + smp_wmb(); + set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0); } static void clear_work_data(struct work_struct *work) { + smp_wmb(); /* see set_work_cpu_and_clear_pending() */ set_work_data(work, WORK_STRUCT_NO_CPU, 0); } @@ -591,7 +602,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) return ((struct cpu_workqueue_struct *) (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; - cpu = data >> WORK_STRUCT_FLAG_BITS; + cpu = data >> WORK_OFFQ_CPU_SHIFT; if (cpu == WORK_CPU_NONE) return NULL; @@ -599,6 +610,22 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) return get_gcwq(cpu); } +static void mark_work_canceling(struct work_struct *work) +{ + struct global_cwq *gcwq = get_work_gcwq(work); + unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE; + + set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING, + WORK_STRUCT_PENDING); +} + +static bool work_is_canceling(struct work_struct *work) +{ + unsigned long data = atomic_long_read(&work->data); + + return !(data & WORK_STRUCT_CWQ) && (data & WORK_OFFQ_CANCELING); +} + /* * Policy functions. These define the policies on how the global worker * pools are managed. Unless noted otherwise, these functions assume that @@ -657,6 +684,13 @@ static bool too_many_workers(struct worker_pool *pool) int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; + /* + * nr_idle and idle_list may disagree if idle rebinding is in + * progress. Never return %true if idle_list is empty. + */ + if (list_empty(&pool->idle_list)) + return false; + return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; } @@ -705,8 +739,10 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) { struct worker *worker = kthread_data(task); - if (!(worker->flags & WORKER_NOT_RUNNING)) + if (!(worker->flags & WORKER_NOT_RUNNING)) { + WARN_ON_ONCE(worker->pool->gcwq->cpu != cpu); atomic_inc(get_pool_nr_running(worker->pool)); + } } /** @@ -903,6 +939,206 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq, } /** + * move_linked_works - move linked works to a list + * @work: start of series of works to be scheduled + * @head: target list to append @work to + * @nextp: out paramter for nested worklist walking + * + * Schedule linked works starting from @work to @head. Work series to + * be scheduled starts at @work and includes any consecutive work with + * WORK_STRUCT_LINKED set in its predecessor. + * + * If @nextp is not NULL, it's updated to point to the next work of + * the last scheduled work. This allows move_linked_works() to be + * nested inside outer list_for_each_entry_safe(). + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void move_linked_works(struct work_struct *work, struct list_head *head, + struct work_struct **nextp) +{ + struct work_struct *n; + + /* + * Linked worklist will always end before the end of the list, + * use NULL for list head. + */ + list_for_each_entry_safe_from(work, n, NULL, entry) { + list_move_tail(&work->entry, head); + if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) + break; + } + + /* + * If we're already inside safe list traversal and have moved + * multiple works to the scheduled queue, the next position + * needs to be updated. + */ + if (nextp) + *nextp = n; +} + +static void cwq_activate_delayed_work(struct work_struct *work) +{ + struct cpu_workqueue_struct *cwq = get_work_cwq(work); + + trace_workqueue_activate_work(work); + move_linked_works(work, &cwq->pool->worklist, NULL); + __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); + cwq->nr_active++; +} + +static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) +{ + struct work_struct *work = list_first_entry(&cwq->delayed_works, + struct work_struct, entry); + + cwq_activate_delayed_work(work); +} + +/** + * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight + * @cwq: cwq of interest + * @color: color of work which left the queue + * + * A work either has completed or is removed from pending queue, + * decrement nr_in_flight of its cwq and handle workqueue flushing. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) +{ + /* ignore uncolored works */ + if (color == WORK_NO_COLOR) + return; + + cwq->nr_in_flight[color]--; + + cwq->nr_active--; + if (!list_empty(&cwq->delayed_works)) { + /* one down, submit a delayed one */ + if (cwq->nr_active < cwq->max_active) + cwq_activate_first_delayed(cwq); + } + + /* is flush in progress and are we at the flushing tip? */ + if (likely(cwq->flush_color != color)) + return; + + /* are there still in-flight works? */ + if (cwq->nr_in_flight[color]) + return; + + /* this cwq is done, clear flush_color */ + cwq->flush_color = -1; + + /* + * If this was the last cwq, wake up the first flusher. It + * will handle the rest. + */ + if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) + complete(&cwq->wq->first_flusher->done); +} + +/** + * try_to_grab_pending - steal work item from worklist and disable irq + * @work: work item to steal + * @is_dwork: @work is a delayed_work + * @flags: place to store irq state + * + * Try to grab PENDING bit of @work. This function can handle @work in any + * stable state - idle, on timer or on worklist. Return values are + * + * 1 if @work was pending and we successfully stole PENDING + * 0 if @work was idle and we claimed PENDING + * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry + * -ENOENT if someone else is canceling @work, this state may persist + * for arbitrarily long + * + * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting + * interrupted while holding PENDING and @work off queue, irq must be + * disabled on entry. This, combined with delayed_work->timer being + * irqsafe, ensures that we return -EAGAIN for finite short period of time. + * + * On successful return, >= 0, irq is disabled and the caller is + * responsible for releasing it using local_irq_restore(*@flags). + * + * This function is safe to call from any context including IRQ handler. + */ +static int try_to_grab_pending(struct work_struct *work, bool is_dwork, + unsigned long *flags) +{ + struct global_cwq *gcwq; + + local_irq_save(*flags); + + /* try to steal the timer if it exists */ + if (is_dwork) { + struct delayed_work *dwork = to_delayed_work(work); + + /* + * dwork->timer is irqsafe. If del_timer() fails, it's + * guaranteed that the timer is not queued anywhere and not + * running on the local CPU. + */ + if (likely(del_timer(&dwork->timer))) + return 1; + } + + /* try to claim PENDING the normal way */ + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) + return 0; + + /* + * The queueing is in progress, or it is already queued. Try to + * steal it from ->worklist without clearing WORK_STRUCT_PENDING. + */ + gcwq = get_work_gcwq(work); + if (!gcwq) + goto fail; + + spin_lock(&gcwq->lock); + if (!list_empty(&work->entry)) { + /* + * This work is queued, but perhaps we locked the wrong gcwq. + * In that case we must see the new value after rmb(), see + * insert_work()->wmb(). + */ + smp_rmb(); + if (gcwq == get_work_gcwq(work)) { + debug_work_deactivate(work); + + /* + * A delayed work item cannot be grabbed directly + * because it might have linked NO_COLOR work items + * which, if left on the delayed_list, will confuse + * cwq->nr_active management later on and cause + * stall. Make sure the work item is activated + * before grabbing. + */ + if (*work_data_bits(work) & WORK_STRUCT_DELAYED) + cwq_activate_delayed_work(work); + + list_del_init(&work->entry); + cwq_dec_nr_in_flight(get_work_cwq(work), + get_work_color(work)); + + spin_unlock(&gcwq->lock); + return 1; + } + } + spin_unlock(&gcwq->lock); +fail: + local_irq_restore(*flags); + if (work_is_canceling(work)) + return -ENOENT; + cpu_relax(); + return -EAGAIN; +} + +/** * insert_work - insert a work into gcwq * @cwq: cwq @work belongs to * @work: work to insert @@ -982,7 +1218,15 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, struct cpu_workqueue_struct *cwq; struct list_head *worklist; unsigned int work_flags; - unsigned long flags; + unsigned int req_cpu = cpu; + + /* + * While a work item is PENDING && off queue, a task trying to + * steal the PENDING will busy-loop waiting for it to either get + * queued or lose PENDING. Grabbing PENDING and queueing should + * happen with IRQ disabled. + */ + WARN_ON_ONCE(!irqs_disabled()); debug_work_activate(work); @@ -995,21 +1239,22 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, if (!(wq->flags & WQ_UNBOUND)) { struct global_cwq *last_gcwq; - if (unlikely(cpu == WORK_CPU_UNBOUND)) + if (cpu == WORK_CPU_UNBOUND) cpu = raw_smp_processor_id(); /* - * It's multi cpu. If @wq is non-reentrant and @work - * was previously on a different cpu, it might still - * be running there, in which case the work needs to - * be queued on that cpu to guarantee non-reentrance. + * It's multi cpu. If @work was previously on a different + * cpu, it might still be running there, in which case the + * work needs to be queued on that cpu to guarantee + * non-reentrancy. */ gcwq = get_gcwq(cpu); - if (wq->flags & WQ_NON_REENTRANT && - (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) { + last_gcwq = get_work_gcwq(work); + + if (last_gcwq && last_gcwq != gcwq) { struct worker *worker; - spin_lock_irqsave(&last_gcwq->lock, flags); + spin_lock(&last_gcwq->lock); worker = find_worker_executing_work(last_gcwq, work); @@ -1017,22 +1262,23 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, gcwq = last_gcwq; else { /* meh... not running there, queue here */ - spin_unlock_irqrestore(&last_gcwq->lock, flags); - spin_lock_irqsave(&gcwq->lock, flags); + spin_unlock(&last_gcwq->lock); + spin_lock(&gcwq->lock); } - } else - spin_lock_irqsave(&gcwq->lock, flags); + } else { + spin_lock(&gcwq->lock); + } } else { gcwq = get_gcwq(WORK_CPU_UNBOUND); - spin_lock_irqsave(&gcwq->lock, flags); + spin_lock(&gcwq->lock); } /* gcwq determined, get cwq and queue */ cwq = get_cwq(gcwq->cpu, wq); - trace_workqueue_queue_work(cpu, cwq, work); + trace_workqueue_queue_work(req_cpu, cwq, work); if (WARN_ON(!list_empty(&work->entry))) { - spin_unlock_irqrestore(&gcwq->lock, flags); + spin_unlock(&gcwq->lock); return; } @@ -1050,134 +1296,220 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, insert_work(cwq, work, worklist, work_flags); - spin_unlock_irqrestore(&gcwq->lock, flags); + spin_unlock(&gcwq->lock); } /** - * queue_work - queue work on a workqueue + * queue_work_on - queue work on specific cpu + * @cpu: CPU number to execute work on * @wq: workqueue to use * @work: work to queue * - * Returns 0 if @work was already on a queue, non-zero otherwise. + * Returns %false if @work was already on a queue, %true otherwise. * - * We queue the work to the CPU on which it was submitted, but if the CPU dies - * it can be processed by another CPU. + * We queue the work to a specific CPU, the caller must ensure it + * can't go away. */ -int queue_work(struct workqueue_struct *wq, struct work_struct *work) +bool queue_work_on(int cpu, struct workqueue_struct *wq, + struct work_struct *work) { - int ret; + bool ret = false; + unsigned long flags; - ret = queue_work_on(get_cpu(), wq, work); - put_cpu(); + local_irq_save(flags); + + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + __queue_work(cpu, wq, work); + ret = true; + } + local_irq_restore(flags); return ret; } -EXPORT_SYMBOL_GPL(queue_work); +EXPORT_SYMBOL_GPL(queue_work_on); /** - * queue_work_on - queue work on specific cpu - * @cpu: CPU number to execute work on + * queue_work - queue work on a workqueue * @wq: workqueue to use * @work: work to queue * - * Returns 0 if @work was already on a queue, non-zero otherwise. + * Returns %false if @work was already on a queue, %true otherwise. * - * We queue the work to a specific CPU, the caller must ensure it - * can't go away. + * We queue the work to the CPU on which it was submitted, but if the CPU dies + * it can be processed by another CPU. */ -int -queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) +bool queue_work(struct workqueue_struct *wq, struct work_struct *work) { - int ret = 0; - - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { - __queue_work(cpu, wq, work); - ret = 1; - } - return ret; + return queue_work_on(WORK_CPU_UNBOUND, wq, work); } -EXPORT_SYMBOL_GPL(queue_work_on); +EXPORT_SYMBOL_GPL(queue_work); -static void delayed_work_timer_fn(unsigned long __data) +void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); - __queue_work(smp_processor_id(), cwq->wq, &dwork->work); + /* should have been called from irqsafe timer with irq already off */ + __queue_work(dwork->cpu, cwq->wq, &dwork->work); +} +EXPORT_SYMBOL_GPL(delayed_work_timer_fn); + +static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, + struct delayed_work *dwork, unsigned long delay) +{ + struct timer_list *timer = &dwork->timer; + struct work_struct *work = &dwork->work; + unsigned int lcpu; + + WARN_ON_ONCE(timer->function != delayed_work_timer_fn || + timer->data != (unsigned long)dwork); + WARN_ON_ONCE(timer_pending(timer)); + WARN_ON_ONCE(!list_empty(&work->entry)); + + /* + * If @delay is 0, queue @dwork->work immediately. This is for + * both optimization and correctness. The earliest @timer can + * expire is on the closest next tick and delayed_work users depend + * on that there's no such delay when @delay is 0. + */ + if (!delay) { + __queue_work(cpu, wq, &dwork->work); + return; + } + + timer_stats_timer_set_start_info(&dwork->timer); + + /* + * This stores cwq for the moment, for the timer_fn. Note that the + * work's gcwq is preserved to allow reentrance detection for + * delayed works. + */ + if (!(wq->flags & WQ_UNBOUND)) { + struct global_cwq *gcwq = get_work_gcwq(work); + + /* + * If we cannot get the last gcwq from @work directly, + * select the last CPU such that it avoids unnecessarily + * triggering non-reentrancy check in __queue_work(). + */ + lcpu = cpu; + if (gcwq) + lcpu = gcwq->cpu; + if (lcpu == WORK_CPU_UNBOUND) + lcpu = raw_smp_processor_id(); + } else { + lcpu = WORK_CPU_UNBOUND; + } + + set_work_cwq(work, get_cwq(lcpu, wq), 0); + + dwork->cpu = cpu; + timer->expires = jiffies + delay; + + if (unlikely(cpu != WORK_CPU_UNBOUND)) + add_timer_on(timer, cpu); + else + add_timer(timer); } /** + * queue_delayed_work_on - queue work on specific CPU after delay + * @cpu: CPU number to execute work on + * @wq: workqueue to use + * @dwork: work to queue + * @delay: number of jiffies to wait before queueing + * + * Returns %false if @work was already on a queue, %true otherwise. If + * @delay is zero and @dwork is idle, it will be scheduled for immediate + * execution. + */ +bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, + struct delayed_work *dwork, unsigned long delay) +{ + struct work_struct *work = &dwork->work; + bool ret = false; + unsigned long flags; + + /* read the comment in __queue_work() */ + local_irq_save(flags); + + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + __queue_delayed_work(cpu, wq, dwork, delay); + ret = true; + } + + local_irq_restore(flags); + return ret; +} +EXPORT_SYMBOL_GPL(queue_delayed_work_on); + +/** * queue_delayed_work - queue work on a workqueue after delay * @wq: workqueue to use * @dwork: delayable work to queue * @delay: number of jiffies to wait before queueing * - * Returns 0 if @work was already on a queue, non-zero otherwise. + * Equivalent to queue_delayed_work_on() but tries to use the local CPU. */ -int queue_delayed_work(struct workqueue_struct *wq, +bool queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { - if (delay == 0) - return queue_work(wq, &dwork->work); - - return queue_delayed_work_on(-1, wq, dwork, delay); + return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); } EXPORT_SYMBOL_GPL(queue_delayed_work); /** - * queue_delayed_work_on - queue work on specific CPU after delay + * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU * @cpu: CPU number to execute work on * @wq: workqueue to use * @dwork: work to queue * @delay: number of jiffies to wait before queueing * - * Returns 0 if @work was already on a queue, non-zero otherwise. + * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise, + * modify @dwork's timer so that it expires after @delay. If @delay is + * zero, @work is guaranteed to be scheduled immediately regardless of its + * current state. + * + * Returns %false if @dwork was idle and queued, %true if @dwork was + * pending and its timer was modified. + * + * This function is safe to call from any context including IRQ handler. + * See try_to_grab_pending() for details. */ -int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, - struct delayed_work *dwork, unsigned long delay) +bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, + struct delayed_work *dwork, unsigned long delay) { - int ret = 0; - struct timer_list *timer = &dwork->timer; - struct work_struct *work = &dwork->work; - - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { - unsigned int lcpu; - - BUG_ON(timer_pending(timer)); - BUG_ON(!list_empty(&work->entry)); - - timer_stats_timer_set_start_info(&dwork->timer); - - /* - * This stores cwq for the moment, for the timer_fn. - * Note that the work's gcwq is preserved to allow - * reentrance detection for delayed works. - */ - if (!(wq->flags & WQ_UNBOUND)) { - struct global_cwq *gcwq = get_work_gcwq(work); - - if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND) - lcpu = gcwq->cpu; - else - lcpu = raw_smp_processor_id(); - } else - lcpu = WORK_CPU_UNBOUND; - - set_work_cwq(work, get_cwq(lcpu, wq), 0); + unsigned long flags; + int ret; - timer->expires = jiffies + delay; - timer->data = (unsigned long)dwork; - timer->function = delayed_work_timer_fn; + do { + ret = try_to_grab_pending(&dwork->work, true, &flags); + } while (unlikely(ret == -EAGAIN)); - if (unlikely(cpu >= 0)) - add_timer_on(timer, cpu); - else - add_timer(timer); - ret = 1; + if (likely(ret >= 0)) { + __queue_delayed_work(cpu, wq, dwork, delay); + local_irq_restore(flags); } + + /* -ENOENT from try_to_grab_pending() becomes %true */ return ret; } -EXPORT_SYMBOL_GPL(queue_delayed_work_on); +EXPORT_SYMBOL_GPL(mod_delayed_work_on); + +/** + * mod_delayed_work - modify delay of or queue a delayed work + * @wq: workqueue to use + * @dwork: work to queue + * @delay: number of jiffies to wait before queueing + * + * mod_delayed_work_on() on local CPU. + */ +bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, + unsigned long delay) +{ + return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); +} +EXPORT_SYMBOL_GPL(mod_delayed_work); /** * worker_enter_idle - enter idle state @@ -1305,37 +1637,21 @@ __acquires(&gcwq->lock) } } -struct idle_rebind { - int cnt; /* # workers to be rebound */ - struct completion done; /* all workers rebound */ -}; - /* - * Rebind an idle @worker to its CPU. During CPU onlining, this has to - * happen synchronously for idle workers. worker_thread() will test - * %WORKER_REBIND before leaving idle and call this function. + * Rebind an idle @worker to its CPU. worker_thread() will test + * list_empty(@worker->entry) before leaving idle and call this function. */ static void idle_worker_rebind(struct worker *worker) { struct global_cwq *gcwq = worker->pool->gcwq; - /* CPU must be online at this point */ - WARN_ON(!worker_maybe_bind_and_lock(worker)); - if (!--worker->idle_rebind->cnt) - complete(&worker->idle_rebind->done); - spin_unlock_irq(&worker->pool->gcwq->lock); + /* CPU may go down again inbetween, clear UNBOUND only on success */ + if (worker_maybe_bind_and_lock(worker)) + worker_clr_flags(worker, WORKER_UNBOUND); - /* we did our part, wait for rebind_workers() to finish up */ - wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); - - /* - * rebind_workers() shouldn't finish until all workers passed the - * above WORKER_REBIND wait. Tell it when done. - */ - spin_lock_irq(&worker->pool->gcwq->lock); - if (!--worker->idle_rebind->cnt) - complete(&worker->idle_rebind->done); - spin_unlock_irq(&worker->pool->gcwq->lock); + /* rebind complete, become available again */ + list_add(&worker->entry, &worker->pool->idle_list); + spin_unlock_irq(&gcwq->lock); } /* @@ -1349,16 +1665,8 @@ static void busy_worker_rebind_fn(struct work_struct *work) struct worker *worker = container_of(work, struct worker, rebind_work); struct global_cwq *gcwq = worker->pool->gcwq; - worker_maybe_bind_and_lock(worker); - - /* - * %WORKER_REBIND must be cleared even if the above binding failed; - * otherwise, we may confuse the next CPU_UP cycle or oops / get - * stuck by calling idle_worker_rebind() prematurely. If CPU went - * down again inbetween, %WORKER_UNBOUND would be set, so clearing - * %WORKER_REBIND is always safe. - */ - worker_clr_flags(worker, WORKER_REBIND); + if (worker_maybe_bind_and_lock(worker)) + worker_clr_flags(worker, WORKER_UNBOUND); spin_unlock_irq(&gcwq->lock); } @@ -1370,123 +1678,74 @@ static void busy_worker_rebind_fn(struct work_struct *work) * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding * is different for idle and busy ones. * - * The idle ones should be rebound synchronously and idle rebinding should - * be complete before any worker starts executing work items with - * concurrency management enabled; otherwise, scheduler may oops trying to - * wake up non-local idle worker from wq_worker_sleeping(). + * Idle ones will be removed from the idle_list and woken up. They will + * add themselves back after completing rebind. This ensures that the + * idle_list doesn't contain any unbound workers when re-bound busy workers + * try to perform local wake-ups for concurrency management. * - * This is achieved by repeatedly requesting rebinding until all idle - * workers are known to have been rebound under @gcwq->lock and holding all - * idle workers from becoming busy until idle rebinding is complete. + * Busy workers can rebind after they finish their current work items. + * Queueing the rebind work item at the head of the scheduled list is + * enough. Note that nr_running will be properly bumped as busy workers + * rebind. * - * Once idle workers are rebound, busy workers can be rebound as they - * finish executing their current work items. Queueing the rebind work at - * the head of their scheduled lists is enough. Note that nr_running will - * be properbly bumped as busy workers rebind. - * - * On return, all workers are guaranteed to either be bound or have rebind - * work item scheduled. + * On return, all non-manager workers are scheduled for rebind - see + * manage_workers() for the manager special case. Any idle worker + * including the manager will not appear on @idle_list until rebind is + * complete, making local wake-ups safe. */ static void rebind_workers(struct global_cwq *gcwq) - __releases(&gcwq->lock) __acquires(&gcwq->lock) { - struct idle_rebind idle_rebind; struct worker_pool *pool; - struct worker *worker; + struct worker *worker, *n; struct hlist_node *pos; int i; lockdep_assert_held(&gcwq->lock); for_each_worker_pool(pool, gcwq) - lockdep_assert_held(&pool->manager_mutex); + lockdep_assert_held(&pool->assoc_mutex); - /* - * Rebind idle workers. Interlocked both ways. We wait for - * workers to rebind via @idle_rebind.done. Workers will wait for - * us to finish up by watching %WORKER_REBIND. - */ - init_completion(&idle_rebind.done); -retry: - idle_rebind.cnt = 1; - INIT_COMPLETION(idle_rebind.done); - - /* set REBIND and kick idle ones, we'll wait for these later */ + /* dequeue and kick idle ones */ for_each_worker_pool(pool, gcwq) { - list_for_each_entry(worker, &pool->idle_list, entry) { - unsigned long worker_flags = worker->flags; - - if (worker->flags & WORKER_REBIND) - continue; - - /* morph UNBOUND to REBIND atomically */ - worker_flags &= ~WORKER_UNBOUND; - worker_flags |= WORKER_REBIND; - ACCESS_ONCE(worker->flags) = worker_flags; - - idle_rebind.cnt++; - worker->idle_rebind = &idle_rebind; + list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { + /* + * idle workers should be off @pool->idle_list + * until rebind is complete to avoid receiving + * premature local wake-ups. + */ + list_del_init(&worker->entry); - /* worker_thread() will call idle_worker_rebind() */ + /* + * worker_thread() will see the above dequeuing + * and call idle_worker_rebind(). + */ wake_up_process(worker->task); } } - if (--idle_rebind.cnt) { - spin_unlock_irq(&gcwq->lock); - wait_for_completion(&idle_rebind.done); - spin_lock_irq(&gcwq->lock); - /* busy ones might have become idle while waiting, retry */ - goto retry; - } - - /* all idle workers are rebound, rebind busy workers */ + /* rebind busy workers */ for_each_busy_worker(worker, i, pos, gcwq) { struct work_struct *rebind_work = &worker->rebind_work; - unsigned long worker_flags = worker->flags; - - /* morph UNBOUND to REBIND atomically */ - worker_flags &= ~WORKER_UNBOUND; - worker_flags |= WORKER_REBIND; - ACCESS_ONCE(worker->flags) = worker_flags; + struct workqueue_struct *wq; if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(rebind_work))) continue; - /* wq doesn't matter, use the default one */ debug_work_activate(rebind_work); - insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, - worker->scheduled.next, - work_color_to_flags(WORK_NO_COLOR)); - } - - /* - * All idle workers are rebound and waiting for %WORKER_REBIND to - * be cleared inside idle_worker_rebind(). Clear and release. - * Clearing %WORKER_REBIND from this foreign context is safe - * because these workers are still guaranteed to be idle. - * - * We need to make sure all idle workers passed WORKER_REBIND wait - * in idle_worker_rebind() before returning; otherwise, workers can - * get stuck at the wait if hotplug cycle repeats. - */ - idle_rebind.cnt = 1; - INIT_COMPLETION(idle_rebind.done); - for_each_worker_pool(pool, gcwq) { - list_for_each_entry(worker, &pool->idle_list, entry) { - worker->flags &= ~WORKER_REBIND; - idle_rebind.cnt++; - } - } - - wake_up_all(&gcwq->rebind_hold); + /* + * wq doesn't really matter but let's keep @worker->pool + * and @cwq->pool consistent for sanity. + */ + if (worker_pool_pri(worker->pool)) + wq = system_highpri_wq; + else + wq = system_wq; - if (--idle_rebind.cnt) { - spin_unlock_irq(&gcwq->lock); - wait_for_completion(&idle_rebind.done); - spin_lock_irq(&gcwq->lock); + insert_work(get_cwq(gcwq->cpu, wq), rebind_work, + worker->scheduled.next, + work_color_to_flags(WORK_NO_COLOR)); } } @@ -1844,22 +2103,22 @@ static bool manage_workers(struct worker *worker) * grab %POOL_MANAGING_WORKERS to achieve this because that can * lead to idle worker depletion (all become busy thinking someone * else is managing) which in turn can result in deadlock under - * extreme circumstances. Use @pool->manager_mutex to synchronize + * extreme circumstances. Use @pool->assoc_mutex to synchronize * manager against CPU hotplug. * - * manager_mutex would always be free unless CPU hotplug is in + * assoc_mutex would always be free unless CPU hotplug is in * progress. trylock first without dropping @gcwq->lock. */ - if (unlikely(!mutex_trylock(&pool->manager_mutex))) { + if (unlikely(!mutex_trylock(&pool->assoc_mutex))) { spin_unlock_irq(&pool->gcwq->lock); - mutex_lock(&pool->manager_mutex); + mutex_lock(&pool->assoc_mutex); /* * CPU hotplug could have happened while we were waiting - * for manager_mutex. Hotplug itself can't handle us + * for assoc_mutex. Hotplug itself can't handle us * because manager isn't either on idle or busy list, and * @gcwq's state and ours could have deviated. * - * As hotplug is now excluded via manager_mutex, we can + * As hotplug is now excluded via assoc_mutex, we can * simply try to bind. It will succeed or fail depending * on @gcwq's current state. Try it and adjust * %WORKER_UNBOUND accordingly. @@ -1882,112 +2141,11 @@ static bool manage_workers(struct worker *worker) ret |= maybe_create_worker(pool); pool->flags &= ~POOL_MANAGING_WORKERS; - mutex_unlock(&pool->manager_mutex); + mutex_unlock(&pool->assoc_mutex); return ret; } /** - * move_linked_works - move linked works to a list - * @work: start of series of works to be scheduled - * @head: target list to append @work to - * @nextp: out paramter for nested worklist walking - * - * Schedule linked works starting from @work to @head. Work series to - * be scheduled starts at @work and includes any consecutive work with - * WORK_STRUCT_LINKED set in its predecessor. - * - * If @nextp is not NULL, it's updated to point to the next work of - * the last scheduled work. This allows move_linked_works() to be - * nested inside outer list_for_each_entry_safe(). - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - */ -static void move_linked_works(struct work_struct *work, struct list_head *head, - struct work_struct **nextp) -{ - struct work_struct *n; - - /* - * Linked worklist will always end before the end of the list, - * use NULL for list head. - */ - list_for_each_entry_safe_from(work, n, NULL, entry) { - list_move_tail(&work->entry, head); - if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) - break; - } - - /* - * If we're already inside safe list traversal and have moved - * multiple works to the scheduled queue, the next position - * needs to be updated. - */ - if (nextp) - *nextp = n; -} - -static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) -{ - struct work_struct *work = list_first_entry(&cwq->delayed_works, - struct work_struct, entry); - - trace_workqueue_activate_work(work); - move_linked_works(work, &cwq->pool->worklist, NULL); - __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); - cwq->nr_active++; -} - -/** - * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight - * @cwq: cwq of interest - * @color: color of work which left the queue - * @delayed: for a delayed work - * - * A work either has completed or is removed from pending queue, - * decrement nr_in_flight of its cwq and handle workqueue flushing. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - */ -static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color, - bool delayed) -{ - /* ignore uncolored works */ - if (color == WORK_NO_COLOR) - return; - - cwq->nr_in_flight[color]--; - - if (!delayed) { - cwq->nr_active--; - if (!list_empty(&cwq->delayed_works)) { - /* one down, submit a delayed one */ - if (cwq->nr_active < cwq->max_active) - cwq_activate_first_delayed(cwq); - } - } - - /* is flush in progress and are we at the flushing tip? */ - if (likely(cwq->flush_color != color)) - return; - - /* are there still in-flight works? */ - if (cwq->nr_in_flight[color]) - return; - - /* this cwq is done, clear flush_color */ - cwq->flush_color = -1; - - /* - * If this was the last cwq, wake up the first flusher. It - * will handle the rest. - */ - if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) - complete(&cwq->wq->first_flusher->done); -} - -/** * process_one_work - process single work * @worker: self * @work: work to process @@ -2030,7 +2188,7 @@ __acquires(&gcwq->lock) * necessary to avoid spurious warnings from rescuers servicing the * unbound or a disassociated gcwq. */ - WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) && + WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) && !(gcwq->flags & GCWQ_DISASSOCIATED) && raw_smp_processor_id() != gcwq->cpu); @@ -2046,15 +2204,13 @@ __acquires(&gcwq->lock) return; } - /* claim and process */ + /* claim and dequeue */ debug_work_deactivate(work); hlist_add_head(&worker->hentry, bwh); worker->current_work = work; worker->current_cwq = cwq; work_color = get_work_color(work); - /* record the current cpu number in the work data and dequeue */ - set_work_cpu(work, gcwq->cpu); list_del_init(&work->entry); /* @@ -2071,9 +2227,16 @@ __acquires(&gcwq->lock) if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) wake_up_worker(pool); + /* + * Record the last CPU and clear PENDING which should be the last + * update to @work. Also, do this inside @gcwq->lock so that + * PENDING and queued state changes happen together while IRQ is + * disabled. + */ + set_work_cpu_and_clear_pending(work, gcwq->cpu); + spin_unlock_irq(&gcwq->lock); - work_clear_pending(work); lock_map_acquire_read(&cwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); trace_workqueue_execute_start(work); @@ -2087,11 +2250,9 @@ __acquires(&gcwq->lock) lock_map_release(&cwq->wq->lockdep_map); if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { - printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " - "%s/0x%08x/%d\n", - current->comm, preempt_count(), task_pid_nr(current)); - printk(KERN_ERR " last function: "); - print_symbol("%s\n", (unsigned long)f); + pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" + " last function: %pf\n", + current->comm, preempt_count(), task_pid_nr(current), f); debug_show_held_locks(current); dump_stack(); } @@ -2106,7 +2267,7 @@ __acquires(&gcwq->lock) hlist_del_init(&worker->hentry); worker->current_work = NULL; worker->current_cwq = NULL; - cwq_dec_nr_in_flight(cwq, work_color, false); + cwq_dec_nr_in_flight(cwq, work_color); } /** @@ -2151,18 +2312,17 @@ static int worker_thread(void *__worker) woke_up: spin_lock_irq(&gcwq->lock); - /* - * DIE can be set only while idle and REBIND set while busy has - * @worker->rebind_work scheduled. Checking here is enough. - */ - if (unlikely(worker->flags & (WORKER_REBIND | WORKER_DIE))) { + /* we are off idle list if destruction or rebind is requested */ + if (unlikely(list_empty(&worker->entry))) { spin_unlock_irq(&gcwq->lock); + /* if DIE is set, destruction is requested */ if (worker->flags & WORKER_DIE) { worker->task->flags &= ~PF_WQ_WORKER; return 0; } + /* otherwise, rebind */ idle_worker_rebind(worker); goto woke_up; } @@ -2257,8 +2417,10 @@ static int rescuer_thread(void *__wq) repeat: set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); return 0; + } /* * See whether any cpu is asking for help. Unbounded @@ -2645,8 +2807,8 @@ reflush: if (++flush_cnt == 10 || (flush_cnt % 100 == 0 && flush_cnt <= 1000)) - pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n", - wq->name, flush_cnt); + pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n", + wq->name, flush_cnt); goto reflush; } @@ -2657,8 +2819,7 @@ reflush: } EXPORT_SYMBOL_GPL(drain_workqueue); -static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, - bool wait_executing) +static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) { struct worker *worker = NULL; struct global_cwq *gcwq; @@ -2680,13 +2841,12 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, cwq = get_work_cwq(work); if (unlikely(!cwq || gcwq != cwq->pool->gcwq)) goto already_gone; - } else if (wait_executing) { + } else { worker = find_worker_executing_work(gcwq, work); if (!worker) goto already_gone; cwq = worker->current_cwq; - } else - goto already_gone; + } insert_wq_barrier(cwq, barr, work, worker); spin_unlock_irq(&gcwq->lock); @@ -2713,15 +2873,8 @@ already_gone: * flush_work - wait for a work to finish executing the last queueing instance * @work: the work to flush * - * Wait until @work has finished execution. This function considers - * only the last queueing instance of @work. If @work has been - * enqueued across different CPUs on a non-reentrant workqueue or on - * multiple workqueues, @work might still be executing on return on - * some of the CPUs from earlier queueing. - * - * If @work was queued only on a non-reentrant, ordered or unbound - * workqueue, @work is guaranteed to be idle on return if it hasn't - * been requeued since flush started. + * Wait until @work has finished execution. @work is guaranteed to be idle + * on return if it hasn't been requeued since flush started. * * RETURNS: * %true if flush_work() waited for the work to finish execution, @@ -2734,140 +2887,36 @@ bool flush_work(struct work_struct *work) lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); - if (start_flush_work(work, &barr, true)) { + if (start_flush_work(work, &barr)) { wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); return true; - } else - return false; -} -EXPORT_SYMBOL_GPL(flush_work); - -static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) -{ - struct wq_barrier barr; - struct worker *worker; - - spin_lock_irq(&gcwq->lock); - - worker = find_worker_executing_work(gcwq, work); - if (unlikely(worker)) - insert_wq_barrier(worker->current_cwq, &barr, work, worker); - - spin_unlock_irq(&gcwq->lock); - - if (unlikely(worker)) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); - return true; - } else + } else { return false; -} - -static bool wait_on_work(struct work_struct *work) -{ - bool ret = false; - int cpu; - - might_sleep(); - - lock_map_acquire(&work->lockdep_map); - lock_map_release(&work->lockdep_map); - - for_each_gcwq_cpu(cpu) - ret |= wait_on_cpu_work(get_gcwq(cpu), work); - return ret; -} - -/** - * flush_work_sync - wait until a work has finished execution - * @work: the work to flush - * - * Wait until @work has finished execution. On return, it's - * guaranteed that all queueing instances of @work which happened - * before this function is called are finished. In other words, if - * @work hasn't been requeued since this function was called, @work is - * guaranteed to be idle on return. - * - * RETURNS: - * %true if flush_work_sync() waited for the work to finish execution, - * %false if it was already idle. - */ -bool flush_work_sync(struct work_struct *work) -{ - struct wq_barrier barr; - bool pending, waited; - - /* we'll wait for executions separately, queue barr only if pending */ - pending = start_flush_work(work, &barr, false); - - /* wait for executions to finish */ - waited = wait_on_work(work); - - /* wait for the pending one */ - if (pending) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); - } - - return pending || waited; -} -EXPORT_SYMBOL_GPL(flush_work_sync); - -/* - * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, - * so this work can't be re-armed in any way. - */ -static int try_to_grab_pending(struct work_struct *work) -{ - struct global_cwq *gcwq; - int ret = -1; - - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) - return 0; - - /* - * The queueing is in progress, or it is already queued. Try to - * steal it from ->worklist without clearing WORK_STRUCT_PENDING. - */ - gcwq = get_work_gcwq(work); - if (!gcwq) - return ret; - - spin_lock_irq(&gcwq->lock); - if (!list_empty(&work->entry)) { - /* - * This work is queued, but perhaps we locked the wrong gcwq. - * In that case we must see the new value after rmb(), see - * insert_work()->wmb(). - */ - smp_rmb(); - if (gcwq == get_work_gcwq(work)) { - debug_work_deactivate(work); - list_del_init(&work->entry); - cwq_dec_nr_in_flight(get_work_cwq(work), - get_work_color(work), - *work_data_bits(work) & WORK_STRUCT_DELAYED); - ret = 1; - } } - spin_unlock_irq(&gcwq->lock); - - return ret; } +EXPORT_SYMBOL_GPL(flush_work); -static bool __cancel_work_timer(struct work_struct *work, - struct timer_list* timer) +static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) { + unsigned long flags; int ret; do { - ret = (timer && likely(del_timer(timer))); - if (!ret) - ret = try_to_grab_pending(work); - wait_on_work(work); + ret = try_to_grab_pending(work, is_dwork, &flags); + /* + * If someone else is canceling, wait for the same event it + * would be waiting for before retrying. + */ + if (unlikely(ret == -ENOENT)) + flush_work(work); } while (unlikely(ret < 0)); + /* tell other tasks trying to grab @work to back off */ + mark_work_canceling(work); + local_irq_restore(flags); + + flush_work(work); clear_work_data(work); return ret; } @@ -2892,7 +2941,7 @@ static bool __cancel_work_timer(struct work_struct *work, */ bool cancel_work_sync(struct work_struct *work) { - return __cancel_work_timer(work, NULL); + return __cancel_work_timer(work, false); } EXPORT_SYMBOL_GPL(cancel_work_sync); @@ -2910,33 +2959,44 @@ EXPORT_SYMBOL_GPL(cancel_work_sync); */ bool flush_delayed_work(struct delayed_work *dwork) { + local_irq_disable(); if (del_timer_sync(&dwork->timer)) - __queue_work(raw_smp_processor_id(), + __queue_work(dwork->cpu, get_work_cwq(&dwork->work)->wq, &dwork->work); + local_irq_enable(); return flush_work(&dwork->work); } EXPORT_SYMBOL(flush_delayed_work); /** - * flush_delayed_work_sync - wait for a dwork to finish - * @dwork: the delayed work to flush + * cancel_delayed_work - cancel a delayed work + * @dwork: delayed_work to cancel * - * Delayed timer is cancelled and the pending work is queued for - * execution immediately. Other than timer handling, its behavior - * is identical to flush_work_sync(). + * Kill off a pending delayed_work. Returns %true if @dwork was pending + * and canceled; %false if wasn't pending. Note that the work callback + * function may still be running on return, unless it returns %true and the + * work doesn't re-arm itself. Explicitly flush or use + * cancel_delayed_work_sync() to wait on it. * - * RETURNS: - * %true if flush_work_sync() waited for the work to finish execution, - * %false if it was already idle. + * This function is safe to call from any context including IRQ handler. */ -bool flush_delayed_work_sync(struct delayed_work *dwork) +bool cancel_delayed_work(struct delayed_work *dwork) { - if (del_timer_sync(&dwork->timer)) - __queue_work(raw_smp_processor_id(), - get_work_cwq(&dwork->work)->wq, &dwork->work); - return flush_work_sync(&dwork->work); + unsigned long flags; + int ret; + + do { + ret = try_to_grab_pending(&dwork->work, true, &flags); + } while (unlikely(ret == -EAGAIN)); + + if (unlikely(ret < 0)) + return false; + + set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work)); + local_irq_restore(flags); + return ret; } -EXPORT_SYMBOL(flush_delayed_work_sync); +EXPORT_SYMBOL(cancel_delayed_work); /** * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish @@ -2949,54 +3009,39 @@ EXPORT_SYMBOL(flush_delayed_work_sync); */ bool cancel_delayed_work_sync(struct delayed_work *dwork) { - return __cancel_work_timer(&dwork->work, &dwork->timer); + return __cancel_work_timer(&dwork->work, true); } EXPORT_SYMBOL(cancel_delayed_work_sync); /** - * schedule_work - put work task in global workqueue - * @work: job to be done - * - * Returns zero if @work was already on the kernel-global workqueue and - * non-zero otherwise. - * - * This puts a job in the kernel-global workqueue if it was not already - * queued and leaves it in the same position on the kernel-global - * workqueue otherwise. - */ -int schedule_work(struct work_struct *work) -{ - return queue_work(system_wq, work); -} -EXPORT_SYMBOL(schedule_work); - -/* * schedule_work_on - put work task on a specific cpu * @cpu: cpu to put the work task on * @work: job to be done * * This puts a job on a specific cpu */ -int schedule_work_on(int cpu, struct work_struct *work) +bool schedule_work_on(int cpu, struct work_struct *work) { return queue_work_on(cpu, system_wq, work); } EXPORT_SYMBOL(schedule_work_on); /** - * schedule_delayed_work - put work task in global workqueue after delay - * @dwork: job to be done - * @delay: number of jiffies to wait or 0 for immediate execution + * schedule_work - put work task in global workqueue + * @work: job to be done * - * After waiting for a given time this puts a job in the kernel-global - * workqueue. + * Returns %false if @work was already on the kernel-global workqueue and + * %true otherwise. + * + * This puts a job in the kernel-global workqueue if it was not already + * queued and leaves it in the same position on the kernel-global + * workqueue otherwise. */ -int schedule_delayed_work(struct delayed_work *dwork, - unsigned long delay) +bool schedule_work(struct work_struct *work) { - return queue_delayed_work(system_wq, dwork, delay); + return queue_work(system_wq, work); } -EXPORT_SYMBOL(schedule_delayed_work); +EXPORT_SYMBOL(schedule_work); /** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay @@ -3007,14 +3052,28 @@ EXPORT_SYMBOL(schedule_delayed_work); * After waiting for a given time this puts a job in the kernel-global * workqueue on the specified CPU. */ -int schedule_delayed_work_on(int cpu, - struct delayed_work *dwork, unsigned long delay) +bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, + unsigned long delay) { return queue_delayed_work_on(cpu, system_wq, dwork, delay); } EXPORT_SYMBOL(schedule_delayed_work_on); /** + * schedule_delayed_work - put work task in global workqueue after delay + * @dwork: job to be done + * @delay: number of jiffies to wait or 0 for immediate execution + * + * After waiting for a given time this puts a job in the kernel-global + * workqueue. + */ +bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) +{ + return queue_delayed_work(system_wq, dwork, delay); +} +EXPORT_SYMBOL(schedule_delayed_work); + +/** * schedule_on_each_cpu - execute a function synchronously on each online CPU * @func: the function to call * @@ -3161,9 +3220,8 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; if (max_active < 1 || max_active > lim) - printk(KERN_WARNING "workqueue: max_active %d requested for %s " - "is out of range, clamping between %d and %d\n", - max_active, name, 1, lim); + pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n", + max_active, name, 1, lim); return clamp_val(max_active, 1, lim); } @@ -3319,6 +3377,26 @@ void destroy_workqueue(struct workqueue_struct *wq) EXPORT_SYMBOL_GPL(destroy_workqueue); /** + * cwq_set_max_active - adjust max_active of a cwq + * @cwq: target cpu_workqueue_struct + * @max_active: new max_active value. + * + * Set @cwq->max_active to @max_active and activate delayed works if + * increased. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void cwq_set_max_active(struct cpu_workqueue_struct *cwq, int max_active) +{ + cwq->max_active = max_active; + + while (!list_empty(&cwq->delayed_works) && + cwq->nr_active < cwq->max_active) + cwq_activate_first_delayed(cwq); +} + +/** * workqueue_set_max_active - adjust max_active of a workqueue * @wq: target workqueue * @max_active: new max_active value. @@ -3345,7 +3423,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) if (!(wq->flags & WQ_FREEZABLE) || !(gcwq->flags & GCWQ_FREEZING)) - get_cwq(gcwq->cpu, wq)->max_active = max_active; + cwq_set_max_active(get_cwq(gcwq->cpu, wq), max_active); spin_unlock_irq(&gcwq->lock); } @@ -3409,7 +3487,7 @@ unsigned int work_busy(struct work_struct *work) unsigned int ret = 0; if (!gcwq) - return false; + return 0; spin_lock_irqsave(&gcwq->lock, flags); @@ -3440,23 +3518,23 @@ EXPORT_SYMBOL_GPL(work_busy); */ /* claim manager positions of all pools */ -static void gcwq_claim_management_and_lock(struct global_cwq *gcwq) +static void gcwq_claim_assoc_and_lock(struct global_cwq *gcwq) { struct worker_pool *pool; for_each_worker_pool(pool, gcwq) - mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools); + mutex_lock_nested(&pool->assoc_mutex, pool - gcwq->pools); spin_lock_irq(&gcwq->lock); } /* release manager positions */ -static void gcwq_release_management_and_unlock(struct global_cwq *gcwq) +static void gcwq_release_assoc_and_unlock(struct global_cwq *gcwq) { struct worker_pool *pool; spin_unlock_irq(&gcwq->lock); for_each_worker_pool(pool, gcwq) - mutex_unlock(&pool->manager_mutex); + mutex_unlock(&pool->assoc_mutex); } static void gcwq_unbind_fn(struct work_struct *work) @@ -3469,7 +3547,7 @@ static void gcwq_unbind_fn(struct work_struct *work) BUG_ON(gcwq->cpu != smp_processor_id()); - gcwq_claim_management_and_lock(gcwq); + gcwq_claim_assoc_and_lock(gcwq); /* * We've claimed all manager positions. Make all workers unbound @@ -3486,7 +3564,7 @@ static void gcwq_unbind_fn(struct work_struct *work) gcwq->flags |= GCWQ_DISASSOCIATED; - gcwq_release_management_and_unlock(gcwq); + gcwq_release_assoc_and_unlock(gcwq); /* * Call schedule() so that we cross rq->lock and thus can guarantee @@ -3514,7 +3592,7 @@ static void gcwq_unbind_fn(struct work_struct *work) * Workqueues should be brought up before normal priority CPU notifiers. * This will be registered high priority CPU notifier. */ -static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, +static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -3542,10 +3620,10 @@ static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, case CPU_DOWN_FAILED: case CPU_ONLINE: - gcwq_claim_management_and_lock(gcwq); + gcwq_claim_assoc_and_lock(gcwq); gcwq->flags &= ~GCWQ_DISASSOCIATED; rebind_workers(gcwq); - gcwq_release_management_and_unlock(gcwq); + gcwq_release_assoc_and_unlock(gcwq); break; } return NOTIFY_OK; @@ -3555,7 +3633,7 @@ static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, * Workqueues should be brought down after normal priority CPU notifiers. * This will be registered as low priority CPU notifier. */ -static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, +static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -3566,7 +3644,7 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, case CPU_DOWN_PREPARE: /* unbinding should happen on the local CPU */ INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); - schedule_work_on(cpu, &unbind_work); + queue_work_on(cpu, system_highpri_wq, &unbind_work); flush_work(&unbind_work); break; } @@ -3735,11 +3813,7 @@ void thaw_workqueues(void) continue; /* restore max_active and repopulate worklist */ - cwq->max_active = wq->saved_max_active; - - while (!list_empty(&cwq->delayed_works) && - cwq->nr_active < cwq->max_active) - cwq_activate_first_delayed(cwq); + cwq_set_max_active(cwq, wq->saved_max_active); } for_each_worker_pool(pool, gcwq) @@ -3759,8 +3833,12 @@ static int __init init_workqueues(void) unsigned int cpu; int i; + /* make sure we have enough bits for OFFQ CPU number */ + BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) < + WORK_CPU_LAST); + cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); - cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); + hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); /* initialize gcwqs */ for_each_gcwq_cpu(cpu) { @@ -3786,11 +3864,9 @@ static int __init init_workqueues(void) setup_timer(&pool->mayday_timer, gcwq_mayday_timeout, (unsigned long)pool); - mutex_init(&pool->manager_mutex); + mutex_init(&pool->assoc_mutex); ida_init(&pool->worker_ida); } - - init_waitqueue_head(&gcwq->rebind_hold); } /* create the initial worker */ @@ -3813,17 +3889,14 @@ static int __init init_workqueues(void) } system_wq = alloc_workqueue("events", 0, 0); + system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0); system_long_wq = alloc_workqueue("events_long", 0, 0); - system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_UNBOUND_MAX_ACTIVE); system_freezable_wq = alloc_workqueue("events_freezable", WQ_FREEZABLE, 0); - system_nrt_freezable_wq = alloc_workqueue("events_nrt_freezable", - WQ_NON_REENTRANT | WQ_FREEZABLE, 0); - BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || - !system_unbound_wq || !system_freezable_wq || - !system_nrt_freezable_wq); + BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq || + !system_unbound_wq || !system_freezable_wq); return 0; } early_initcall(init_workqueues); |