diff options
Diffstat (limited to 'kernel')
67 files changed, 1922 insertions, 826 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index bf987b95b356..24e7cb0ba26a 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -35,6 +35,7 @@ config PREEMPT_VOLUNTARY config PREEMPT bool "Preemptible Kernel (Low-Latency Desktop)" + select PREEMPT_COUNT help This option reduces the latency of the kernel by making all kernel code (that is not executing in a critical section) @@ -52,3 +53,5 @@ config PREEMPT endchoice +config PREEMPT_COUNT + bool
\ No newline at end of file diff --git a/kernel/Makefile b/kernel/Makefile index 2d64cfcc8b42..d06467fc8f7c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -125,11 +125,10 @@ targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) -quiet_cmd_ikconfiggz = IKCFG $@ - cmd_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@ + filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE - $(call if_changed,ikconfiggz) + $(call filechk,ikconfiggz) $(obj)/time.o: $(obj)/timeconst.h diff --git a/kernel/audit.c b/kernel/audit.c index 939500317066..0a1355ca3d79 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -43,7 +43,7 @@ #include <linux/init.h> #include <asm/types.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/slab.h> @@ -55,6 +55,9 @@ #include <net/sock.h> #include <net/netlink.h> #include <linux/skbuff.h> +#ifdef CONFIG_SECURITY +#include <linux/security.h> +#endif #include <linux/netlink.h> #include <linux/freezer.h> #include <linux/tty.h> @@ -1502,6 +1505,32 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, } } +#ifdef CONFIG_SECURITY +/** + * audit_log_secctx - Converts and logs SELinux context + * @ab: audit_buffer + * @secid: security number + * + * This is a helper function that calls security_secid_to_secctx to convert + * secid to secctx and then adds the (converted) SELinux context to the audit + * log by calling audit_log_format, thus also preventing leak of internal secid + * to userspace. If secid cannot be converted audit_panic is called. + */ +void audit_log_secctx(struct audit_buffer *ab, u32 secid) +{ + u32 len; + char *secctx; + + if (security_secid_to_secctx(secid, &secctx, &len)) { + audit_panic("Cannot convert secid to context"); + } else { + audit_log_format(ab, " obj=%s", secctx); + security_release_secctx(secctx, len); + } +} +EXPORT_SYMBOL(audit_log_secctx); +#endif + EXPORT_SYMBOL(audit_log_start); EXPORT_SYMBOL(audit_log_end); EXPORT_SYMBOL(audit_log_format); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index e99dda04b126..5bf0790497e7 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -93,16 +93,10 @@ static inline void get_tree(struct audit_tree *tree) atomic_inc(&tree->count); } -static void __put_tree(struct rcu_head *rcu) -{ - struct audit_tree *tree = container_of(rcu, struct audit_tree, head); - kfree(tree); -} - static inline void put_tree(struct audit_tree *tree) { if (atomic_dec_and_test(&tree->count)) - call_rcu(&tree->head, __put_tree); + kfree_rcu(tree, head); } /* to avoid bringing the entire thing in audit.h */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 00d79df03e76..ce4b054acee5 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -44,7 +44,7 @@ #include <linux/init.h> #include <asm/types.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/mm.h> diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2731d115d725..1d2b6ceea95d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -27,9 +27,11 @@ */ #include <linux/cgroup.h> +#include <linux/cred.h> #include <linux/ctype.h> #include <linux/errno.h> #include <linux/fs.h> +#include <linux/init_task.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/mm.h> @@ -59,7 +61,7 @@ #include <linux/poll.h> #include <linux/flex_array.h> /* used in cgroup_attach_proc */ -#include <asm/atomic.h> +#include <linux/atomic.h> static DEFINE_MUTEX(cgroup_mutex); @@ -1514,6 +1516,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, struct cgroup *root_cgrp = &root->top_cgroup; struct inode *inode; struct cgroupfs_root *existing_root; + const struct cred *cred; int i; BUG_ON(sb->s_root != NULL); @@ -1593,7 +1596,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(root->number_of_cgroups != 1); + cred = override_creds(&init_cred); cgroup_populate_dir(root_cgrp); + revert_creds(cred); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); } else { @@ -1697,7 +1702,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) { char *start; struct dentry *dentry = rcu_dereference_check(cgrp->dentry, - rcu_read_lock_held() || cgroup_lock_is_held()); if (!dentry || cgrp == dummytop) { @@ -1723,7 +1727,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) break; dentry = rcu_dereference_check(cgrp->dentry, - rcu_read_lock_held() || cgroup_lock_is_held()); if (!cgrp->parent) continue; @@ -3542,7 +3545,8 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, } /* the process need read permission on control file */ - ret = file_permission(cfile, MAY_READ); + /* AV: shouldn't we check that it's been opened for read instead? */ + ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ); if (ret < 0) goto fail; @@ -4813,8 +4817,7 @@ unsigned short css_id(struct cgroup_subsys_state *css) * on this or this is under rcu_read_lock(). Once css->id is allocated, * it's unchanged until freed. */ - cssid = rcu_dereference_check(css->id, - rcu_read_lock_held() || atomic_read(&css->refcnt)); + cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); if (cssid) return cssid->id; @@ -4826,8 +4829,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css) { struct css_id *cssid; - cssid = rcu_dereference_check(css->id, - rcu_read_lock_held() || atomic_read(&css->refcnt)); + cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); if (cssid) return cssid->depth; diff --git a/kernel/compat.c b/kernel/compat.c index fc9eb093acd5..e2435ee9993a 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -158,6 +158,7 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user __put_user(ts->tv_sec, &cts->tv_sec) || __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } +EXPORT_SYMBOL_GPL(put_compat_timespec); static long compat_nanosleep_restart(struct restart_block *restart) { @@ -890,6 +891,7 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat) case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); } } +EXPORT_SYMBOL_GPL(sigset_from_compat); asmlinkage long compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, @@ -991,11 +993,8 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat sigset_from_compat(&newset, &newset32); sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); - spin_lock_irq(¤t->sighand->siglock); current->saved_sigmask = current->blocked; - current->blocked = newset; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); + set_current_blocked(&newset); current->state = TASK_INTERRUPTIBLE; schedule(); diff --git a/kernel/configs.c b/kernel/configs.c index b4066b44a99d..42e8fa075eed 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -92,8 +92,8 @@ static void __exit ikconfig_cleanup(void) module_init(ikconfig_init); module_exit(ikconfig_cleanup); +#endif /* CONFIG_IKCONFIG_PROC */ + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Randy Dunlap"); MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel"); - -#endif /* CONFIG_IKCONFIG_PROC */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 9c9b7545c810..10131fdaff70 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -55,7 +55,7 @@ #include <linux/sort.h> #include <asm/uaccess.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/mutex.h> #include <linux/workqueue.h> #include <linux/cgroup.h> @@ -2460,11 +2460,19 @@ static int cpuset_spread_node(int *rotor) int cpuset_mem_spread_node(void) { + if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE) + current->cpuset_mem_spread_rotor = + node_random(¤t->mems_allowed); + return cpuset_spread_node(¤t->cpuset_mem_spread_rotor); } int cpuset_slab_spread_node(void) { + if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE) + current->cpuset_slab_spread_rotor = + node_random(¤t->mems_allowed); + return cpuset_spread_node(¤t->cpuset_slab_spread_rotor); } diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index bad6786dee88..0d7c08784efb 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -51,7 +51,7 @@ #include <asm/cacheflush.h> #include <asm/byteorder.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <asm/system.h> #include "debug_core.h" diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index a11db956dd62..34872482315e 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -42,6 +42,8 @@ /* Our I/O buffers. */ static char remcom_in_buffer[BUFMAX]; static char remcom_out_buffer[BUFMAX]; +static int gdbstub_use_prev_in_buf; +static int gdbstub_prev_in_buf_pos; /* Storage for the registers, in GDB format. */ static unsigned long gdb_regs[(NUMREGBYTES + @@ -58,6 +60,13 @@ static int gdbstub_read_wait(void) int ret = -1; int i; + if (unlikely(gdbstub_use_prev_in_buf)) { + if (gdbstub_prev_in_buf_pos < gdbstub_use_prev_in_buf) + return remcom_in_buffer[gdbstub_prev_in_buf_pos++]; + else + gdbstub_use_prev_in_buf = 0; + } + /* poll any additional I/O interfaces that are defined */ while (ret < 0) for (i = 0; kdb_poll_funcs[i] != NULL; i++) { @@ -109,7 +118,6 @@ static void get_packet(char *buffer) buffer[count] = ch; count = count + 1; } - buffer[count] = 0; if (ch == '#') { xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4; @@ -124,6 +132,7 @@ static void get_packet(char *buffer) if (dbg_io_ops->flush) dbg_io_ops->flush(); } + buffer[count] = 0; } while (checksum != xmitcsum); } @@ -1082,12 +1091,11 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd) case 'c': strcpy(remcom_in_buffer, cmd); return 0; - case '?': - gdb_cmd_status(ks); - break; - case '\0': - strcpy(remcom_out_buffer, ""); - break; + case '$': + strcpy(remcom_in_buffer, cmd); + gdbstub_use_prev_in_buf = strlen(remcom_in_buffer); + gdbstub_prev_in_buf_pos = 0; + return 0; } dbg_io_ops->write_char('+'); put_packet(remcom_out_buffer); diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 2f62fe85f16a..7179eac7b41c 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -112,9 +112,8 @@ kdb_bt(int argc, const char **argv) unsigned long addr; long offset; - kdbgetintenv("BTARGS", &argcount); /* Arguments to print */ - kdbgetintenv("BTAPROMPT", &btaprompt); /* Prompt after each - * proc in bta */ + /* Prompt after each proc in bta */ + kdbgetintenv("BTAPROMPT", &btaprompt); if (strcmp(argv[0], "bta") == 0) { struct task_struct *g, *p; diff --git a/kernel/debug/kdb/kdb_cmds b/kernel/debug/kdb/kdb_cmds index 56c88e4db309..9834ad303ab6 100644 --- a/kernel/debug/kdb/kdb_cmds +++ b/kernel/debug/kdb/kdb_cmds @@ -18,16 +18,12 @@ defcmd dumpcommon "" "Common kdb debugging" endefcmd defcmd dumpall "" "First line debugging" - set BTSYMARG 1 - set BTARGS 9 pid R -dumpcommon -bta endefcmd defcmd dumpcpu "" "Same as dumpall but only tasks on cpus" - set BTSYMARG 1 - set BTARGS 9 pid R -dumpcommon -btc diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index dd0b1b7dd02c..d9ca9aa481ec 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -30,6 +30,8 @@ EXPORT_SYMBOL_GPL(kdb_poll_funcs); int kdb_poll_idx = 1; EXPORT_SYMBOL_GPL(kdb_poll_idx); +static struct kgdb_state *kdb_ks; + int kdb_stub(struct kgdb_state *ks) { int error = 0; @@ -39,6 +41,7 @@ int kdb_stub(struct kgdb_state *ks) kdb_dbtrap_t db_result = KDB_DB_NOBPT; int i; + kdb_ks = ks; if (KDB_STATE(REENTRY)) { reason = KDB_REASON_SWITCH; KDB_STATE_CLEAR(REENTRY); @@ -123,20 +126,8 @@ int kdb_stub(struct kgdb_state *ks) KDB_STATE_CLEAR(PAGER); kdbnearsym_cleanup(); if (error == KDB_CMD_KGDB) { - if (KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)) { - /* - * This inteface glue which allows kdb to transition in into - * the gdb stub. In order to do this the '?' or '' gdb serial - * packet response is processed here. And then control is - * passed to the gdbstub. - */ - if (KDB_STATE(DOING_KGDB)) - gdbstub_state(ks, "?"); - else - gdbstub_state(ks, ""); + if (KDB_STATE(DOING_KGDB)) KDB_STATE_CLEAR(DOING_KGDB); - KDB_STATE_CLEAR(DOING_KGDB2); - } return DBG_PASS_EVENT; } kdb_bp_install(ks->linux_regs); @@ -166,3 +157,7 @@ int kdb_stub(struct kgdb_state *ks) return kgdb_info[ks->cpu].ret_state; } +void kdb_gdb_state_pass(char *buf) +{ + gdbstub_state(kdb_ks, buf); +} diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 96fdaac46a80..4802eb5840e1 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -31,15 +31,21 @@ char kdb_prompt_str[CMD_BUFLEN]; int kdb_trap_printk; -static void kgdb_transition_check(char *buffer) +static int kgdb_transition_check(char *buffer) { - int slen = strlen(buffer); - if (strncmp(buffer, "$?#3f", slen) != 0 && - strncmp(buffer, "$qSupported#37", slen) != 0 && - strncmp(buffer, "+$qSupported#37", slen) != 0) { + if (buffer[0] != '+' && buffer[0] != '$') { KDB_STATE_SET(KGDB_TRANS); kdb_printf("%s", buffer); + } else { + int slen = strlen(buffer); + if (slen > 3 && buffer[slen - 3] == '#') { + kdb_gdb_state_pass(buffer); + strcpy(buffer, "kgdb"); + KDB_STATE_SET(DOING_KGDB); + return 1; + } } + return 0; } static int kdb_read_get_key(char *buffer, size_t bufsize) @@ -251,6 +257,10 @@ poll_again: case 13: /* enter */ *lastchar++ = '\n'; *lastchar++ = '\0'; + if (!KDB_STATE(KGDB_TRANS)) { + KDB_STATE_SET(KGDB_TRANS); + kdb_printf("%s", buffer); + } kdb_printf("\n"); return buffer; case 4: /* Del */ @@ -382,22 +392,26 @@ poll_again: * printed characters if we think that * kgdb is connecting, until the check * fails */ - if (!KDB_STATE(KGDB_TRANS)) - kgdb_transition_check(buffer); - else + if (!KDB_STATE(KGDB_TRANS)) { + if (kgdb_transition_check(buffer)) + return buffer; + } else { kdb_printf("%c", key); + } } /* Special escape to kgdb */ if (lastchar - buffer >= 5 && strcmp(lastchar - 5, "$?#3f") == 0) { + kdb_gdb_state_pass(lastchar - 5); strcpy(buffer, "kgdb"); KDB_STATE_SET(DOING_KGDB); return buffer; } - if (lastchar - buffer >= 14 && - strcmp(lastchar - 14, "$qSupported#37") == 0) { + if (lastchar - buffer >= 11 && + strcmp(lastchar - 11, "$qSupported") == 0) { + kdb_gdb_state_pass(lastchar - 11); strcpy(buffer, "kgdb"); - KDB_STATE_SET(DOING_KGDB2); + KDB_STATE_SET(DOING_KGDB); return buffer; } } diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index be14779bcef6..63786e71a3cd 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -145,7 +145,6 @@ static char *__env[] = { #endif "RADIX=16", "MDCOUNT=8", /* lines of md output */ - "BTARGS=9", /* 9 possible args in bt */ KDB_PLATFORM_ENV, "DTABCOUNT=30", "NOSECT=1", @@ -172,6 +171,7 @@ static char *__env[] = { (char *)0, (char *)0, (char *)0, + (char *)0, }; static const int __nenv = (sizeof(__env) / sizeof(char *)); @@ -1386,7 +1386,7 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error, } if (result == KDB_CMD_KGDB) { - if (!(KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2))) + if (!KDB_STATE(DOING_KGDB)) kdb_printf("Entering please attach debugger " "or use $D#44+ or $3#33\n"); break; diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 35d69ed1dfb5..e381d105b40b 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -21,7 +21,6 @@ #define KDB_CMD_SS (-1003) #define KDB_CMD_SSB (-1004) #define KDB_CMD_KGDB (-1005) -#define KDB_CMD_KGDB2 (-1006) /* Internal debug flags */ #define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */ @@ -146,7 +145,6 @@ extern int kdb_state; * keyboard on this cpu */ #define KDB_STATE_KEXEC 0x00040000 /* kexec issued */ #define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */ -#define KDB_STATE_DOING_KGDB2 0x00100000 /* kgdb enter now issued */ #define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */ #define KDB_STATE_ARCH 0xff000000 /* Reserved for arch * specific use */ @@ -218,6 +216,7 @@ extern void kdb_print_nameval(const char *name, unsigned long val); extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); extern void kdb_meminfo_proc_show(void); extern char *kdb_getstr(char *, size_t, char *); +extern void kdb_gdb_state_pass(char *buf); /* Defines for kdb_symbol_print */ #define KDB_SP_SPACEB 0x0001 /* Space before string */ diff --git a/kernel/delayacct.c b/kernel/delayacct.c index ead9b610aa71..418b3f7053aa 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -19,8 +19,10 @@ #include <linux/time.h> #include <linux/sysctl.h> #include <linux/delayacct.h> +#include <linux/module.h> int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ +EXPORT_SYMBOL_GPL(delayacct_on); struct kmem_cache *delayacct_cache; static int __init delayacct_setup_disable(char *str) diff --git a/kernel/exit.c b/kernel/exit.c index f2b321bae440..2913b3509d42 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -85,7 +85,6 @@ static void __exit_signal(struct task_struct *tsk) struct tty_struct *uninitialized_var(tty); sighand = rcu_dereference_check(tsk->sighand, - rcu_read_lock_held() || lockdep_tasklist_lock_is_held()); spin_lock(&sighand->siglock); @@ -169,7 +168,6 @@ void release_task(struct task_struct * p) struct task_struct *leader; int zap_leader; repeat: - tracehook_prepare_release_task(p); /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials. But shut RCU-lockdep up */ rcu_read_lock(); @@ -179,7 +177,7 @@ repeat: proc_flush_task(p); write_lock_irq(&tasklist_lock); - tracehook_finish_release_task(p); + ptrace_release_task(p); __exit_signal(p); /* @@ -190,22 +188,12 @@ repeat: zap_leader = 0; leader = p->group_leader; if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { - BUG_ON(task_detached(leader)); - do_notify_parent(leader, leader->exit_signal); /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, * then we are the one who should release the leader. - * - * do_notify_parent() will have marked it self-reaping in - * that case. - */ - zap_leader = task_detached(leader); - - /* - * This maintains the invariant that release_task() - * only runs on a task in EXIT_DEAD, just for sanity. */ + zap_leader = do_notify_parent(leader, leader->exit_signal); if (zap_leader) leader->exit_state = EXIT_DEAD; } @@ -277,18 +265,16 @@ int is_current_pgrp_orphaned(void) return retval; } -static int has_stopped_jobs(struct pid *pgrp) +static bool has_stopped_jobs(struct pid *pgrp) { - int retval = 0; struct task_struct *p; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { - if (!task_is_stopped(p)) - continue; - retval = 1; - break; + if (p->signal->flags & SIGNAL_STOP_STOPPED) + return true; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); - return retval; + + return false; } /* @@ -751,7 +737,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, { list_move_tail(&p->sibling, &p->real_parent->children); - if (task_detached(p)) + if (p->exit_state == EXIT_DEAD) return; /* * If this is a threaded reparent there is no need to @@ -764,10 +750,9 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, p->exit_signal = SIGCHLD; /* If it has exited notify the new parent about this child's death. */ - if (!task_ptrace(p) && + if (!p->ptrace && p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { - do_notify_parent(p, p->exit_signal); - if (task_detached(p)) { + if (do_notify_parent(p, p->exit_signal)) { p->exit_state = EXIT_DEAD; list_move_tail(&p->sibling, dead); } @@ -794,7 +779,7 @@ static void forget_original_parent(struct task_struct *father) do { t->real_parent = reaper; if (t->parent == father) { - BUG_ON(task_ptrace(t)); + BUG_ON(t->ptrace); t->parent = t->real_parent; } if (t->pdeath_signal) @@ -819,8 +804,7 @@ static void forget_original_parent(struct task_struct *father) */ static void exit_notify(struct task_struct *tsk, int group_dead) { - int signal; - void *cookie; + bool autoreap; /* * This does two things: @@ -851,26 +835,33 @@ static void exit_notify(struct task_struct *tsk, int group_dead) * we have changed execution domain as these two values started * the same after a fork. */ - if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && + if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD && (tsk->parent_exec_id != tsk->real_parent->self_exec_id || tsk->self_exec_id != tsk->parent_exec_id)) tsk->exit_signal = SIGCHLD; - signal = tracehook_notify_death(tsk, &cookie, group_dead); - if (signal >= 0) - signal = do_notify_parent(tsk, signal); + if (unlikely(tsk->ptrace)) { + int sig = thread_group_leader(tsk) && + thread_group_empty(tsk) && + !ptrace_reparented(tsk) ? + tsk->exit_signal : SIGCHLD; + autoreap = do_notify_parent(tsk, sig); + } else if (thread_group_leader(tsk)) { + autoreap = thread_group_empty(tsk) && + do_notify_parent(tsk, tsk->exit_signal); + } else { + autoreap = true; + } - tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; + tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) wake_up_process(tsk->signal->group_exit_task); write_unlock_irq(&tasklist_lock); - tracehook_report_death(tsk, signal, cookie, group_dead); - /* If the process is dead, release it - nobody will wait for it */ - if (signal == DEATH_REAP) + if (autoreap) release_task(tsk); } @@ -906,7 +897,6 @@ NORET_TYPE void do_exit(long code) profile_task_exit(tsk); - WARN_ON(atomic_read(&tsk->fs_excl)); WARN_ON(blk_needs_flush_plug(tsk)); if (unlikely(in_interrupt())) @@ -923,7 +913,7 @@ NORET_TYPE void do_exit(long code) */ set_fs(USER_DS); - tracehook_report_exit(&code); + ptrace_event(PTRACE_EVENT_EXIT, code); validate_creds_for_do_exit(tsk); @@ -990,6 +980,7 @@ NORET_TYPE void do_exit(long code) trace_sched_process_exit(tsk); exit_sem(tsk); + exit_shm(tsk); exit_files(tsk); exit_fs(tsk); check_stack_usage(); @@ -1235,9 +1226,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) traced = ptrace_reparented(p); /* * It can be ptraced but not reparented, check - * !task_detached() to filter out sub-threads. + * thread_group_leader() to filter out sub-threads. */ - if (likely(!traced) && likely(!task_detached(p))) { + if (likely(!traced) && thread_group_leader(p)) { struct signal_struct *psig; struct signal_struct *sig; unsigned long maxrss; @@ -1345,16 +1336,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) /* We dropped tasklist, ptracer could die and untrace */ ptrace_unlink(p); /* - * If this is not a detached task, notify the parent. - * If it's still not detached after that, don't release - * it now. + * If this is not a sub-thread, notify the parent. + * If parent wants a zombie, don't release it now. */ - if (!task_detached(p)) { - do_notify_parent(p, p->exit_signal); - if (!task_detached(p)) { - p->exit_state = EXIT_ZOMBIE; - p = NULL; - } + if (thread_group_leader(p) && + !do_notify_parent(p, p->exit_signal)) { + p->exit_state = EXIT_ZOMBIE; + p = NULL; } write_unlock_irq(&tasklist_lock); } @@ -1367,7 +1355,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) static int *task_stopped_code(struct task_struct *p, bool ptrace) { if (ptrace) { - if (task_is_stopped_or_traced(p)) + if (task_is_stopped_or_traced(p) && + !(p->jobctl & JOBCTL_LISTENING)) return &p->exit_code; } else { if (p->signal->flags & SIGNAL_STOP_STOPPED) @@ -1563,7 +1552,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, * Notification and reaping will be cascaded to the real * parent when the ptracer detaches. */ - if (likely(!ptrace) && unlikely(task_ptrace(p))) { + if (likely(!ptrace) && unlikely(p->ptrace)) { /* it will become visible, clear notask_error */ wo->notask_error = 0; return 0; @@ -1606,8 +1595,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, * own children, it should create a separate process which * takes the role of real parent. */ - if (likely(!ptrace) && task_ptrace(p) && - same_thread_group(p->parent, p->real_parent)) + if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p)) return 0; /* diff --git a/kernel/fork.c b/kernel/fork.c index 0276c30401a0..e7ceaca89609 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -37,7 +37,6 @@ #include <linux/swap.h> #include <linux/syscalls.h> #include <linux/jiffies.h> -#include <linux/tracehook.h> #include <linux/futex.h> #include <linux/compat.h> #include <linux/kthread.h> @@ -81,7 +80,7 @@ * Protected counters by write_lock_irq(&tasklist_lock) */ unsigned long total_forks; /* Handle normal Linux uptimes. */ -int nr_threads; /* The idle threads do not count.. */ +int nr_threads; /* The idle threads do not count.. */ int max_threads; /* tunable limit on nr_threads */ @@ -233,7 +232,7 @@ void __init fork_init(unsigned long mempages) /* * we need to allow at least 20 threads to boot a system */ - if(max_threads < 20) + if (max_threads < 20) max_threads = 20; init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; @@ -269,7 +268,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) return NULL; } - err = arch_dup_task_struct(tsk, orig); + err = arch_dup_task_struct(tsk, orig); if (err) goto out; @@ -289,9 +288,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->stack_canary = get_random_int(); #endif - /* One for us, one for whoever does the "release_task()" (usually parent) */ - atomic_set(&tsk->usage,2); - atomic_set(&tsk->fs_excl, 0); + /* + * One for us, one for whoever does the "release_task()" (usually + * parent) + */ + atomic_set(&tsk->usage, 2); #ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; #endif @@ -439,7 +440,7 @@ fail_nomem: goto out; } -static inline int mm_alloc_pgd(struct mm_struct * mm) +static inline int mm_alloc_pgd(struct mm_struct *mm) { mm->pgd = pgd_alloc(mm); if (unlikely(!mm->pgd)) @@ -447,7 +448,7 @@ static inline int mm_alloc_pgd(struct mm_struct * mm) return 0; } -static inline void mm_free_pgd(struct mm_struct * mm) +static inline void mm_free_pgd(struct mm_struct *mm) { pgd_free(mm, mm->pgd); } @@ -484,7 +485,7 @@ static void mm_init_aio(struct mm_struct *mm) #endif } -static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) +static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) { atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); @@ -515,9 +516,9 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) /* * Allocate and initialize an mm_struct. */ -struct mm_struct * mm_alloc(void) +struct mm_struct *mm_alloc(void) { - struct mm_struct * mm; + struct mm_struct *mm; mm = allocate_mm(); if (!mm) @@ -585,7 +586,7 @@ void added_exe_file_vma(struct mm_struct *mm) void removed_exe_file_vma(struct mm_struct *mm) { mm->num_exe_file_vmas--; - if ((mm->num_exe_file_vmas == 0) && mm->exe_file){ + if ((mm->num_exe_file_vmas == 0) && mm->exe_file) { fput(mm->exe_file); mm->exe_file = NULL; } @@ -777,9 +778,9 @@ fail_nocontext: return NULL; } -static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) +static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) { - struct mm_struct * mm, *oldmm; + struct mm_struct *mm, *oldmm; int retval; tsk->min_flt = tsk->maj_flt = 0; @@ -846,7 +847,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_files(unsigned long clone_flags, struct task_struct * tsk) +static int copy_files(unsigned long clone_flags, struct task_struct *tsk) { struct files_struct *oldf, *newf; int error = 0; @@ -1013,7 +1014,7 @@ static void rt_mutex_init_task(struct task_struct *p) { raw_spin_lock_init(&p->pi_lock); #ifdef CONFIG_RT_MUTEXES - plist_head_init_raw(&p->pi_waiters, &p->pi_lock); + plist_head_init(&p->pi_waiters); p->pi_blocked_on = NULL; #endif } @@ -1168,13 +1169,17 @@ static struct task_struct *copy_process(unsigned long clone_flags, cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); - if (IS_ERR(p->mempolicy)) { - retval = PTR_ERR(p->mempolicy); - p->mempolicy = NULL; - goto bad_fork_cleanup_cgroup; - } + if (IS_ERR(p->mempolicy)) { + retval = PTR_ERR(p->mempolicy); + p->mempolicy = NULL; + goto bad_fork_cleanup_cgroup; + } mpol_fix_fork_child_flag(p); #endif +#ifdef CONFIG_CPUSETS + p->cpuset_mem_spread_rotor = NUMA_NO_NODE; + p->cpuset_slab_spread_rotor = NUMA_NO_NODE; +#endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW @@ -1214,25 +1219,33 @@ static struct task_struct *copy_process(unsigned long clone_flags, retval = perf_event_init_task(p); if (retval) goto bad_fork_cleanup_policy; - - if ((retval = audit_alloc(p))) + retval = audit_alloc(p); + if (retval) goto bad_fork_cleanup_policy; /* copy all the process information */ - if ((retval = copy_semundo(clone_flags, p))) + retval = copy_semundo(clone_flags, p); + if (retval) goto bad_fork_cleanup_audit; - if ((retval = copy_files(clone_flags, p))) + retval = copy_files(clone_flags, p); + if (retval) goto bad_fork_cleanup_semundo; - if ((retval = copy_fs(clone_flags, p))) + retval = copy_fs(clone_flags, p); + if (retval) goto bad_fork_cleanup_files; - if ((retval = copy_sighand(clone_flags, p))) + retval = copy_sighand(clone_flags, p); + if (retval) goto bad_fork_cleanup_fs; - if ((retval = copy_signal(clone_flags, p))) + retval = copy_signal(clone_flags, p); + if (retval) goto bad_fork_cleanup_sighand; - if ((retval = copy_mm(clone_flags, p))) + retval = copy_mm(clone_flags, p); + if (retval) goto bad_fork_cleanup_signal; - if ((retval = copy_namespaces(clone_flags, p))) + retval = copy_namespaces(clone_flags, p); + if (retval) goto bad_fork_cleanup_mm; - if ((retval = copy_io(clone_flags, p))) + retval = copy_io(clone_flags, p); + if (retval) goto bad_fork_cleanup_namespaces; retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); if (retval) @@ -1254,7 +1267,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* * Clear TID on mm_release()? */ - p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; + p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; #ifdef CONFIG_BLOCK p->plug = NULL; #endif @@ -1322,7 +1335,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, * it's process group. * A fatal signal pending means that current will exit, so the new * thread can't slip out of an OOM kill (or normal SIGKILL). - */ + */ recalc_sigpending(); if (signal_pending(current)) { spin_unlock(¤t->sighand->siglock); @@ -1340,7 +1353,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, } if (likely(p->pid)) { - tracehook_finish_clone(p, clone_flags, trace); + ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); if (thread_group_leader(p)) { if (is_child_reaper(pid)) @@ -1481,10 +1494,22 @@ long do_fork(unsigned long clone_flags, } /* - * When called from kernel_thread, don't do user tracing stuff. + * Determine whether and which event to report to ptracer. When + * called from kernel_thread or CLONE_UNTRACED is explicitly + * requested, no event is reported; otherwise, report if the event + * for the type of forking is enabled. */ - if (likely(user_mode(regs))) - trace = tracehook_prepare_clone(clone_flags); + if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) { + if (clone_flags & CLONE_VFORK) + trace = PTRACE_EVENT_VFORK; + else if ((clone_flags & CSIGNAL) != SIGCHLD) + trace = PTRACE_EVENT_CLONE; + else + trace = PTRACE_EVENT_FORK; + + if (likely(!ptrace_event_enabled(current, trace))) + trace = 0; + } p = copy_process(clone_flags, stack_start, regs, stack_size, child_tidptr, NULL, trace); @@ -1508,26 +1533,26 @@ long do_fork(unsigned long clone_flags, } audit_finish_fork(p); - tracehook_report_clone(regs, clone_flags, nr, p); /* * We set PF_STARTING at creation in case tracing wants to * use this to distinguish a fully live task from one that - * hasn't gotten to tracehook_report_clone() yet. Now we - * clear it and set the child going. + * hasn't finished SIGSTOP raising yet. Now we clear it + * and set the child going. */ p->flags &= ~PF_STARTING; wake_up_new_task(p); - tracehook_report_clone_complete(trace, regs, - clone_flags, nr, p); + /* forking complete and child started to run, tell ptracer */ + if (unlikely(trace)) + ptrace_event(trace, nr); if (clone_flags & CLONE_VFORK) { freezer_do_not_count(); wait_for_completion(&vfork); freezer_count(); - tracehook_report_vfork_done(p, nr); + ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); } } else { nr = PTR_ERR(p); @@ -1574,6 +1599,7 @@ void __init proc_caches_init(void) SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); mmap_init(); + nsproxy_cache_init(); } /* @@ -1670,12 +1696,14 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) */ if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) do_sysvsem = 1; - if ((err = unshare_fs(unshare_flags, &new_fs))) + err = unshare_fs(unshare_flags, &new_fs); + if (err) goto bad_unshare_out; - if ((err = unshare_fd(unshare_flags, &new_fd))) + err = unshare_fd(unshare_flags, &new_fd); + if (err) goto bad_unshare_cleanup_fs; - if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, - new_fs))) + err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs); + if (err) goto bad_unshare_cleanup_fd; if (new_fs || new_fd || do_sysvsem || new_nsproxy) { diff --git a/kernel/futex.c b/kernel/futex.c index fe28dc282eae..11cbe052b2e8 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -218,6 +218,8 @@ static void drop_futex_key_refs(union futex_key *key) * @uaddr: virtual address of the futex * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED * @key: address where result is stored. + * @rw: mapping needs to be read/write (values: VERIFY_READ, + * VERIFY_WRITE) * * Returns a negative error code or 0 * The key words are stored in *key on success. @@ -229,12 +231,12 @@ static void drop_futex_key_refs(union futex_key *key) * lock_page() might sleep, the caller should not hold a spinlock. */ static int -get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) +get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; struct page *page, *page_head; - int err; + int err, ro = 0; /* * The futex address must be "naturally" aligned. @@ -262,8 +264,18 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) again: err = get_user_pages_fast(address, 1, 1, &page); + /* + * If write access is not required (eg. FUTEX_WAIT), try + * and get read-only access. + */ + if (err == -EFAULT && rw == VERIFY_READ) { + err = get_user_pages_fast(address, 1, 0, &page); + ro = 1; + } if (err < 0) return err; + else + err = 0; #ifdef CONFIG_TRANSPARENT_HUGEPAGE page_head = page; @@ -305,6 +317,13 @@ again: if (!page_head->mapping) { unlock_page(page_head); put_page(page_head); + /* + * ZERO_PAGE pages don't have a mapping. Avoid a busy loop + * trying to find one. RW mapping would have COW'd (and thus + * have a mapping) so this page is RO and won't ever change. + */ + if ((page_head == ZERO_PAGE(address))) + return -EFAULT; goto again; } @@ -316,6 +335,15 @@ again: * the object not the particular process. */ if (PageAnon(page_head)) { + /* + * A RO anonymous page will never change and thus doesn't make + * sense for futex operations. + */ + if (ro) { + err = -EFAULT; + goto out; + } + key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; @@ -327,9 +355,10 @@ again: get_futex_key_refs(key); +out: unlock_page(page_head); put_page(page_head); - return 0; + return err; } static inline void put_futex_key(union futex_key *key) @@ -355,8 +384,8 @@ static int fault_in_user_writeable(u32 __user *uaddr) int ret; down_read(&mm->mmap_sem); - ret = get_user_pages(current, mm, (unsigned long)uaddr, - 1, 1, 0, NULL, NULL); + ret = fixup_user_fault(current, mm, (unsigned long)uaddr, + FAULT_FLAG_WRITE); up_read(&mm->mmap_sem); return ret < 0 ? ret : 0; @@ -940,7 +969,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) if (!bitset) return -EINVAL; - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ); if (unlikely(ret != 0)) goto out; @@ -986,10 +1015,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int ret, op_ret; retry: - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); + ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); if (unlikely(ret != 0)) goto out_put_key1; @@ -1243,10 +1272,11 @@ retry: pi_state = NULL; } - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); + ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, + requeue_pi ? VERIFY_WRITE : VERIFY_READ); if (unlikely(ret != 0)) goto out_put_key1; @@ -1790,7 +1820,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, * while the syscall executes. */ retry: - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ); if (unlikely(ret != 0)) return ret; @@ -1941,7 +1971,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect, } retry: - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE); if (unlikely(ret != 0)) goto out; @@ -2060,7 +2090,7 @@ retry: if ((uval & FUTEX_TID_MASK) != vpid) return -EPERM; - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE); if (unlikely(ret != 0)) goto out; @@ -2249,7 +2279,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, debug_rt_mutex_init_waiter(&rt_waiter); rt_waiter.task = NULL; - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); if (unlikely(ret != 0)) goto out; @@ -2697,7 +2727,7 @@ static int __init futex_init(void) futex_cmpxchg_enabled = 1; for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { - plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); + plist_head_init(&futex_queues[i].chain); spin_lock_init(&futex_queues[i].lock); } diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 5bf924d80b5c..a92028196cc1 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -3,7 +3,7 @@ menu "GCOV-based kernel profiling" config GCOV_KERNEL bool "Enable gcov-based kernel profiling" depends on DEBUG_FS - select CONSTRUCTORS + select CONSTRUCTORS if !UML default n ---help--- This option enables gcov-based code profiling (e.g. for code coverage diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index d1d051b38e0b..5a38bf4de641 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -52,6 +52,10 @@ config IRQ_EDGE_EOI_HANDLER config GENERIC_IRQ_CHIP bool +# Generic irq_domain hw <--> linux irq number translation +config IRQ_DOMAIN + bool + # Support forced irq threading config IRQ_FORCED_THREADING bool diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 73290056cfb6..fff17381f0af 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -2,6 +2,7 @@ obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o +obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o obj-$(CONFIG_PM_SLEEP) += pm.o diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 1ef4ffcdfa55..bd8e788d71e0 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -87,8 +87,8 @@ void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) { struct irq_devres match_data = { irq, dev_id }; - free_irq(irq, dev_id); WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match, &match_data)); + free_irq(irq, dev_id); } EXPORT_SYMBOL(devm_free_irq); diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 31a9db711906..3a2cab407b93 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -101,10 +101,10 @@ void irq_gc_unmask_enable_reg(struct irq_data *d) } /** - * irq_gc_ack - Ack pending interrupt + * irq_gc_ack_set_bit - Ack pending interrupt via setting bit * @d: irq_data */ -void irq_gc_ack(struct irq_data *d) +void irq_gc_ack_set_bit(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); u32 mask = 1 << (d->irq - gc->irq_base); @@ -115,6 +115,20 @@ void irq_gc_ack(struct irq_data *d) } /** + * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit + * @d: irq_data + */ +void irq_gc_ack_clr_bit(struct irq_data *d) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + u32 mask = ~(1 << (d->irq - gc->irq_base)); + + irq_gc_lock(gc); + irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); + irq_gc_unlock(gc); +} + +/** * irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt * @d: irq_data */ diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c new file mode 100644 index 000000000000..d5828da3fd38 --- /dev/null +++ b/kernel/irq/irqdomain.c @@ -0,0 +1,180 @@ +#include <linux/irq.h> +#include <linux/irqdomain.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/of.h> +#include <linux/of_address.h> +#include <linux/slab.h> + +static LIST_HEAD(irq_domain_list); +static DEFINE_MUTEX(irq_domain_mutex); + +/** + * irq_domain_add() - Register an irq_domain + * @domain: ptr to initialized irq_domain structure + * + * Registers an irq_domain structure. The irq_domain must at a minimum be + * initialized with an ops structure pointer, and either a ->to_irq hook or + * a valid irq_base value. Everything else is optional. + */ +void irq_domain_add(struct irq_domain *domain) +{ + struct irq_data *d; + int hwirq; + + /* + * This assumes that the irq_domain owner has already allocated + * the irq_descs. This block will be removed when support for dynamic + * allocation of irq_descs is added to irq_domain. + */ + for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) { + d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq)); + if (d || d->domain) { + /* things are broken; just report, don't clean up */ + WARN(1, "error: irq_desc already assigned to a domain"); + return; + } + d->domain = domain; + d->hwirq = hwirq; + } + + mutex_lock(&irq_domain_mutex); + list_add(&domain->list, &irq_domain_list); + mutex_unlock(&irq_domain_mutex); +} + +/** + * irq_domain_del() - Unregister an irq_domain + * @domain: ptr to registered irq_domain. + */ +void irq_domain_del(struct irq_domain *domain) +{ + struct irq_data *d; + int hwirq; + + mutex_lock(&irq_domain_mutex); + list_del(&domain->list); + mutex_unlock(&irq_domain_mutex); + + /* Clear the irq_domain assignments */ + for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) { + d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq)); + d->domain = NULL; + } +} + +#if defined(CONFIG_OF_IRQ) +/** + * irq_create_of_mapping() - Map a linux irq number from a DT interrupt spec + * + * Used by the device tree interrupt mapping code to translate a device tree + * interrupt specifier to a valid linux irq number. Returns either a valid + * linux IRQ number or 0. + * + * When the caller no longer need the irq number returned by this function it + * should arrange to call irq_dispose_mapping(). + */ +unsigned int irq_create_of_mapping(struct device_node *controller, + const u32 *intspec, unsigned int intsize) +{ + struct irq_domain *domain; + unsigned long hwirq; + unsigned int irq, type; + int rc = -EINVAL; + + /* Find a domain which can translate the irq spec */ + mutex_lock(&irq_domain_mutex); + list_for_each_entry(domain, &irq_domain_list, list) { + if (!domain->ops->dt_translate) + continue; + rc = domain->ops->dt_translate(domain, controller, + intspec, intsize, &hwirq, &type); + if (rc == 0) + break; + } + mutex_unlock(&irq_domain_mutex); + + if (rc != 0) + return 0; + + irq = irq_domain_to_irq(domain, hwirq); + if (type != IRQ_TYPE_NONE) + irq_set_irq_type(irq, type); + pr_debug("%s: mapped hwirq=%i to irq=%i, flags=%x\n", + controller->full_name, (int)hwirq, irq, type); + return irq; +} +EXPORT_SYMBOL_GPL(irq_create_of_mapping); + +/** + * irq_dispose_mapping() - Discard a mapping created by irq_create_of_mapping() + * @irq: linux irq number to be discarded + * + * Calling this function indicates the caller no longer needs a reference to + * the linux irq number returned by a prior call to irq_create_of_mapping(). + */ +void irq_dispose_mapping(unsigned int irq) +{ + /* + * nothing yet; will be filled when support for dynamic allocation of + * irq_descs is added to irq_domain + */ +} +EXPORT_SYMBOL_GPL(irq_dispose_mapping); + +int irq_domain_simple_dt_translate(struct irq_domain *d, + struct device_node *controller, + const u32 *intspec, unsigned int intsize, + unsigned long *out_hwirq, unsigned int *out_type) +{ + if (d->of_node != controller) + return -EINVAL; + if (intsize < 1) + return -EINVAL; + + *out_hwirq = intspec[0]; + *out_type = IRQ_TYPE_NONE; + if (intsize > 1) + *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; + return 0; +} + +struct irq_domain_ops irq_domain_simple_ops = { + .dt_translate = irq_domain_simple_dt_translate, +}; +EXPORT_SYMBOL_GPL(irq_domain_simple_ops); + +/** + * irq_domain_create_simple() - Set up a 'simple' translation range + */ +void irq_domain_add_simple(struct device_node *controller, int irq_base) +{ + struct irq_domain *domain; + + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) { + WARN_ON(1); + return; + } + + domain->irq_base = irq_base; + domain->of_node = of_node_get(controller); + domain->ops = &irq_domain_simple_ops; + irq_domain_add(domain); +} +EXPORT_SYMBOL_GPL(irq_domain_add_simple); + +void irq_domain_generate_simple(const struct of_device_id *match, + u64 phys_base, unsigned int irq_start) +{ + struct device_node *node; + pr_info("looking for phys_base=%llx, irq_start=%i\n", + (unsigned long long) phys_base, (int) irq_start); + node = of_find_matching_node_by_address(NULL, match, phys_base); + if (node) + irq_domain_add_simple(node, irq_start); + else + pr_info("no node found\n"); +} +EXPORT_SYMBOL_GPL(irq_domain_generate_simple); +#endif /* CONFIG_OF_IRQ */ diff --git a/kernel/kexec.c b/kernel/kexec.c index 8d814cbc8109..296fbc84d659 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1095,7 +1095,7 @@ size_t crash_get_memory_size(void) size_t size = 0; mutex_lock(&kexec_mutex); if (crashk_res.end != crashk_res.start) - size = crashk_res.end - crashk_res.start + 1; + size = resource_size(&crashk_res); mutex_unlock(&kexec_mutex); return size; } diff --git a/kernel/kmod.c b/kernel/kmod.c index 47613dfb7b28..ddc7644c1305 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -274,7 +274,7 @@ static void __call_usermodehelper(struct work_struct *work) * (used for preventing user land processes from being created after the user * land has been frozen during a system-wide hibernation or suspend operation). */ -static int usermodehelper_disabled; +static int usermodehelper_disabled = 1; /* Number of helpers running */ static atomic_t running_helpers = ATOMIC_INIT(0); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 298c9276dfdb..8c24294e477f 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2468,6 +2468,9 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) BUG_ON(usage_bit >= LOCK_USAGE_STATES); + if (hlock_class(hlock)->key == __lockdep_no_validate__.subkeys) + continue; + if (!mark_lock(curr, hlock, usage_bit)) return 0; } @@ -2478,34 +2481,13 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) /* * Hardirqs will be enabled: */ -void trace_hardirqs_on_caller(unsigned long ip) +static void __trace_hardirqs_on_caller(unsigned long ip) { struct task_struct *curr = current; - time_hardirqs_on(CALLER_ADDR0, ip); - - if (unlikely(!debug_locks || current->lockdep_recursion)) - return; - - if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) - return; - - if (unlikely(curr->hardirqs_enabled)) { - /* - * Neither irq nor preemption are disabled here - * so this is racy by nature but losing one hit - * in a stat is not a big deal. - */ - __debug_atomic_inc(redundant_hardirqs_on); - return; - } /* we'll do an OFF -> ON transition: */ curr->hardirqs_enabled = 1; - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return; - if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) - return; /* * We are going to turn hardirqs on, so set the * usage bit for all held locks: @@ -2525,6 +2507,37 @@ void trace_hardirqs_on_caller(unsigned long ip) curr->hardirq_enable_event = ++curr->irq_events; debug_atomic_inc(hardirqs_on_events); } + +void trace_hardirqs_on_caller(unsigned long ip) +{ + time_hardirqs_on(CALLER_ADDR0, ip); + + if (unlikely(!debug_locks || current->lockdep_recursion)) + return; + + if (unlikely(current->hardirqs_enabled)) { + /* + * Neither irq nor preemption are disabled here + * so this is racy by nature but losing one hit + * in a stat is not a big deal. + */ + __debug_atomic_inc(redundant_hardirqs_on); + return; + } + + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + + if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) + return; + + if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) + return; + + current->lockdep_recursion = 1; + __trace_hardirqs_on_caller(ip); + current->lockdep_recursion = 0; +} EXPORT_SYMBOL(trace_hardirqs_on_caller); void trace_hardirqs_on(void) @@ -2574,7 +2587,7 @@ void trace_softirqs_on(unsigned long ip) { struct task_struct *curr = current; - if (unlikely(!debug_locks)) + if (unlikely(!debug_locks || current->lockdep_recursion)) return; if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) @@ -2585,6 +2598,7 @@ void trace_softirqs_on(unsigned long ip) return; } + current->lockdep_recursion = 1; /* * We'll do an OFF -> ON transition: */ @@ -2599,6 +2613,7 @@ void trace_softirqs_on(unsigned long ip) */ if (curr->hardirqs_enabled) mark_held_locks(curr, SOFTIRQ); + current->lockdep_recursion = 0; } /* @@ -2608,7 +2623,7 @@ void trace_softirqs_off(unsigned long ip) { struct task_struct *curr = current; - if (unlikely(!debug_locks)) + if (unlikely(!debug_locks || current->lockdep_recursion)) return; if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) @@ -2859,10 +2874,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { - int i; - - for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) - lock->class_cache[i] = NULL; + memset(lock, 0, sizeof(*lock)); #ifdef CONFIG_LOCK_STAT lock->cpu = raw_smp_processor_id(); diff --git a/kernel/module.c b/kernel/module.c index 795bdc7f5c3f..04379f92f843 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -545,9 +545,9 @@ static void setup_modinfo_##field(struct module *mod, const char *s) \ mod->field = kstrdup(s, GFP_KERNEL); \ } \ static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ - struct module *mod, char *buffer) \ + struct module_kobject *mk, char *buffer) \ { \ - return sprintf(buffer, "%s\n", mod->field); \ + return sprintf(buffer, "%s\n", mk->mod->field); \ } \ static int modinfo_##field##_exists(struct module *mod) \ { \ @@ -902,9 +902,9 @@ void symbol_put_addr(void *addr) EXPORT_SYMBOL_GPL(symbol_put_addr); static ssize_t show_refcnt(struct module_attribute *mattr, - struct module *mod, char *buffer) + struct module_kobject *mk, char *buffer) { - return sprintf(buffer, "%u\n", module_refcount(mod)); + return sprintf(buffer, "%u\n", module_refcount(mk->mod)); } static struct module_attribute refcnt = { @@ -952,11 +952,11 @@ static inline int module_unload_init(struct module *mod) #endif /* CONFIG_MODULE_UNLOAD */ static ssize_t show_initstate(struct module_attribute *mattr, - struct module *mod, char *buffer) + struct module_kobject *mk, char *buffer) { const char *state = "unknown"; - switch (mod->state) { + switch (mk->mod->state) { case MODULE_STATE_LIVE: state = "live"; break; @@ -975,10 +975,27 @@ static struct module_attribute initstate = { .show = show_initstate, }; +static ssize_t store_uevent(struct module_attribute *mattr, + struct module_kobject *mk, + const char *buffer, size_t count) +{ + enum kobject_action action; + + if (kobject_action_type(buffer, count, &action) == 0) + kobject_uevent(&mk->kobj, action); + return count; +} + +struct module_attribute module_uevent = { + .attr = { .name = "uevent", .mode = 0200 }, + .store = store_uevent, +}; + static struct module_attribute *modinfo_attrs[] = { &modinfo_version, &modinfo_srcversion, &initstate, + &module_uevent, #ifdef CONFIG_MODULE_UNLOAD &refcnt, #endif @@ -1187,7 +1204,7 @@ struct module_sect_attrs }; static ssize_t module_sect_show(struct module_attribute *mattr, - struct module *mod, char *buf) + struct module_kobject *mk, char *buf) { struct module_sect_attr *sattr = container_of(mattr, struct module_sect_attr, mattr); @@ -1697,6 +1714,15 @@ static void unset_module_core_ro_nx(struct module *mod) { } static void unset_module_init_ro_nx(struct module *mod) { } #endif +void __weak module_free(struct module *mod, void *module_region) +{ + vfree(module_region); +} + +void __weak module_arch_cleanup(struct module *mod) +{ +} + /* Free a module, remove from lists, etc. */ static void free_module(struct module *mod) { @@ -1851,6 +1877,26 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) return ret; } +int __weak apply_relocate(Elf_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + pr_err("module %s: REL relocation unsupported\n", me->name); + return -ENOEXEC; +} + +int __weak apply_relocate_add(Elf_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + pr_err("module %s: RELA relocation unsupported\n", me->name); + return -ENOEXEC; +} + static int apply_relocations(struct module *mod, const struct load_info *info) { unsigned int i; @@ -2235,6 +2281,11 @@ static void dynamic_debug_remove(struct _ddebug *debug) ddebug_remove_module(debug->modname); } +void * __weak module_alloc(unsigned long size) +{ + return size == 0 ? NULL : vmalloc_exec(size); +} + static void *module_alloc_update_bounds(unsigned long size) { void *ret = module_alloc(size); @@ -2645,6 +2696,14 @@ static void flush_module_icache(const struct module *mod) set_fs(old_fs); } +int __weak module_frob_arch_sections(Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, + char *secstrings, + struct module *mod) +{ + return 0; +} + static struct module *layout_and_allocate(struct load_info *info) { /* Module within temporary copy. */ @@ -2716,6 +2775,13 @@ static void module_deallocate(struct module *mod, struct load_info *info) module_free(mod, mod->module_core); } +int __weak module_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *me) +{ + return 0; +} + static int post_relocation(struct module *mod, const struct load_info *info) { /* Sort exception table now relocations are done. */ diff --git a/kernel/notifier.c b/kernel/notifier.c index 2488ba7eb568..8d7b435806c9 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -525,37 +525,6 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh) } EXPORT_SYMBOL_GPL(srcu_init_notifier_head); -/** - * register_reboot_notifier - Register function to be called at reboot time - * @nb: Info about notifier function to be called - * - * Registers a function with the list of functions - * to be called at reboot time. - * - * Currently always returns zero, as blocking_notifier_chain_register() - * always returns zero. - */ -int register_reboot_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_register(&reboot_notifier_list, nb); -} -EXPORT_SYMBOL(register_reboot_notifier); - -/** - * unregister_reboot_notifier - Unregister previously registered reboot notifier - * @nb: Hook to be unregistered - * - * Unregisters a previously registered reboot - * notifier function. - * - * Returns zero on success, or %-ENOENT on failure. - */ -int unregister_reboot_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); -} -EXPORT_SYMBOL(unregister_reboot_notifier); - static ATOMIC_NOTIFIER_HEAD(die_chain); int notrace __kprobes notify_die(enum die_val val, const char *str, diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index d6a00f3de15d..9aeab4b98c64 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -271,10 +271,8 @@ out: return err; } -static int __init nsproxy_cache_init(void) +int __init nsproxy_cache_init(void) { nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); return 0; } - -module_init(nsproxy_cache_init); diff --git a/kernel/panic.c b/kernel/panic.c index 69231670eb95..d7bb6974efb5 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -119,6 +119,8 @@ NORET_TYPE void panic(const char * fmt, ...) } mdelay(PANIC_TIMER_STEP); } + } + if (panic_timeout != 0) { /* * This will not be a clean reboot, with everything * shutting down. But if there is a chance of diff --git a/kernel/params.c b/kernel/params.c index ed72e1330862..22df3e0d142a 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -225,8 +225,8 @@ int parse_args(const char *name, int ret; \ \ ret = strtolfn(val, 0, &l); \ - if (ret == -EINVAL || ((type)l != l)) \ - return -EINVAL; \ + if (ret < 0 || ((type)l != l)) \ + return ret < 0 ? ret : -EINVAL; \ *((type *)kp->arg) = l; \ return 0; \ } \ @@ -511,7 +511,7 @@ struct module_param_attrs #define to_param_attr(n) container_of(n, struct param_attribute, mattr) static ssize_t param_attr_show(struct module_attribute *mattr, - struct module *mod, char *buf) + struct module_kobject *mk, char *buf) { int count; struct param_attribute *attribute = to_param_attr(mattr); @@ -531,7 +531,7 @@ static ssize_t param_attr_show(struct module_attribute *mattr, /* sysfs always hands a nul-terminated string in buf. We rely on that. */ static ssize_t param_attr_store(struct module_attribute *mattr, - struct module *owner, + struct module_kobject *km, const char *buf, size_t len) { int err; @@ -730,6 +730,10 @@ static struct module_kobject * __init locate_module_kobject(const char *name) mk->kobj.kset = module_kset; err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name); +#ifdef CONFIG_MODULES + if (!err) + err = sysfs_create_file(&mk->kobj, &module_uevent.attr); +#endif if (err) { kobject_put(&mk->kobj); printk(KERN_ERR @@ -807,7 +811,7 @@ static void __init param_sysfs_builtin(void) } ssize_t __modver_version_show(struct module_attribute *mattr, - struct module *mod, char *buf) + struct module_kobject *mk, char *buf) { struct module_version_attribute *vattr = container_of(mattr, struct module_version_attribute, mattr); @@ -852,7 +856,7 @@ static ssize_t module_attr_show(struct kobject *kobj, if (!attribute->show) return -EIO; - ret = attribute->show(attribute, mk->mod, buf); + ret = attribute->show(attribute, mk, buf); return ret; } @@ -871,7 +875,7 @@ static ssize_t module_attr_store(struct kobject *kobj, if (!attribute->store) return -EIO; - ret = attribute->store(attribute, mk->mod, buf, len); + ret = attribute->store(attribute, mk, buf, len); return ret; } diff --git a/kernel/pid.c b/kernel/pid.c index 57a8346a270e..e432057f3b21 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -405,7 +405,6 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) if (pid) { struct hlist_node *first; first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), - rcu_read_lock_held() || lockdep_tasklist_lock_is_held()); if (first) result = hlist_entry(first, struct task_struct, pids[(type)].node); diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 6824ca7d4d0c..37f05d0f0793 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -74,7 +74,7 @@ static DEFINE_SPINLOCK(pm_qos_lock); static struct pm_qos_object null_pm_qos; static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); static struct pm_qos_object cpu_dma_pm_qos = { - .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), + .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests), .notifiers = &cpu_dma_lat_notifier, .name = "cpu_dma_latency", .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, @@ -84,7 +84,7 @@ static struct pm_qos_object cpu_dma_pm_qos = { static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); static struct pm_qos_object network_lat_pm_qos = { - .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), + .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests), .notifiers = &network_lat_notifier, .name = "network_latency", .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, @@ -95,7 +95,7 @@ static struct pm_qos_object network_lat_pm_qos = { static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); static struct pm_qos_object network_throughput_pm_qos = { - .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), + .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests), .notifiers = &network_throughput_notifier, .name = "network_throughput", .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 87f4d24b55b0..b1914cb9095c 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -193,8 +193,8 @@ config APM_EMULATION notification of APM "events" (e.g. battery status change). In order to use APM, you will need supporting software. For location - and more information, read <file:Documentation/power/pm.txt> and the - Battery Powered Linux mini-HOWTO, available from + and more information, read <file:Documentation/power/apm-acpi.txt> + and the Battery Powered Linux mini-HOWTO, available from <http://www.tldp.org/docs.html#howto>. This driver does not spin down disk drives (see the hdparm(8) @@ -224,6 +224,10 @@ config PM_OPP implementations a ready to use framework to manage OPPs. For more information, read <file:Documentation/power/opp.txt> -config PM_RUNTIME_CLK +config PM_CLK def_bool y - depends on PM_RUNTIME && HAVE_CLK + depends on PM && HAVE_CLK + +config PM_GENERIC_DOMAINS + bool + depends on PM diff --git a/kernel/power/main.c b/kernel/power/main.c index 2981af4ce7cb..6c601f871964 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -37,8 +37,9 @@ EXPORT_SYMBOL_GPL(unregister_pm_notifier); int pm_notifier_call_chain(unsigned long val) { - return (blocking_notifier_call_chain(&pm_chain_head, val, NULL) - == NOTIFY_BAD) ? -EINVAL : 0; + int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL); + + return notifier_to_errno(ret); } /* If set, devices may be suspended and resumed asynchronously. */ diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index ace55889f702..06efa54f93d6 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1211,7 +1211,11 @@ static void free_unnecessary_pages(void) to_free_highmem = alloc_highmem - save; } else { to_free_highmem = 0; - to_free_normal -= save - alloc_highmem; + save -= alloc_highmem; + if (to_free_normal > save) + to_free_normal -= save; + else + to_free_normal = 0; } memory_bm_position_reset(©_bm); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 1c41ba215419..b6b71ad2208f 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -44,6 +44,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops) suspend_ops = ops; mutex_unlock(&pm_mutex); } +EXPORT_SYMBOL_GPL(suspend_set_ops); bool valid_state(suspend_state_t state) { @@ -65,6 +66,7 @@ int suspend_valid_only_mem(suspend_state_t state) { return state == PM_SUSPEND_MEM; } +EXPORT_SYMBOL_GPL(suspend_valid_only_mem); static int suspend_test(int level) { @@ -126,12 +128,13 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void) } /** - * suspend_enter - enter the desired system sleep state. - * @state: state to enter + * suspend_enter - enter the desired system sleep state. + * @state: State to enter + * @wakeup: Returns information that suspend should not be entered again. * - * This function should be called after devices have been suspended. + * This function should be called after devices have been suspended. */ -static int suspend_enter(suspend_state_t state) +static int suspend_enter(suspend_state_t state, bool *wakeup) { int error; @@ -165,7 +168,8 @@ static int suspend_enter(suspend_state_t state) error = syscore_suspend(); if (!error) { - if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { + *wakeup = pm_wakeup_pending(); + if (!(suspend_test(TEST_CORE) || *wakeup)) { error = suspend_ops->enter(state); events_check_enabled = false; } @@ -199,6 +203,7 @@ static int suspend_enter(suspend_state_t state) int suspend_devices_and_enter(suspend_state_t state) { int error; + bool wakeup = false; if (!suspend_ops) return -ENOSYS; @@ -220,7 +225,10 @@ int suspend_devices_and_enter(suspend_state_t state) if (suspend_test(TEST_DEVICES)) goto Recover_platform; - error = suspend_enter(state); + do { + error = suspend_enter(state, &wakeup); + } while (!error && !wakeup + && suspend_ops->suspend_again && suspend_ops->suspend_again()); Resume_devices: suspend_test_start(); diff --git a/kernel/printk.c b/kernel/printk.c index 35185392173f..37dff3429adb 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -782,7 +782,7 @@ static inline int can_use_console(unsigned int cpu) static int console_trylock_for_printk(unsigned int cpu) __releases(&logbuf_lock) { - int retval = 0; + int retval = 0, wake = 0; if (console_trylock()) { retval = 1; @@ -795,12 +795,14 @@ static int console_trylock_for_printk(unsigned int cpu) */ if (!can_use_console(cpu)) { console_locked = 0; - up(&console_sem); + wake = 1; retval = 0; } } printk_cpu = UINT_MAX; spin_unlock(&logbuf_lock); + if (wake) + up(&console_sem); return retval; } static const char recursion_bug_msg [] = @@ -1242,7 +1244,7 @@ void console_unlock(void) { unsigned long flags; unsigned _con_start, _log_end; - unsigned wake_klogd = 0; + unsigned wake_klogd = 0, retry = 0; if (console_suspended) { up(&console_sem); @@ -1251,6 +1253,7 @@ void console_unlock(void) console_may_schedule = 0; +again: for ( ; ; ) { spin_lock_irqsave(&logbuf_lock, flags); wake_klogd |= log_start - log_end; @@ -1271,8 +1274,23 @@ void console_unlock(void) if (unlikely(exclusive_console)) exclusive_console = NULL; + spin_unlock(&logbuf_lock); + up(&console_sem); + + /* + * Someone could have filled up the buffer again, so re-check if there's + * something to flush. In case we cannot trylock the console_sem again, + * there's a new owner and the console_unlock() from them will do the + * flush, no worries. + */ + spin_lock(&logbuf_lock); + if (con_start != log_end) + retry = 1; spin_unlock_irqrestore(&logbuf_lock, flags); + if (retry && console_trylock()) + goto again; + if (wake_klogd) wake_up_klogd(); } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 2df115790cd9..9de3ecfd20f9 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -23,8 +23,15 @@ #include <linux/uaccess.h> #include <linux/regset.h> #include <linux/hw_breakpoint.h> +#include <linux/cn_proc.h> +static int ptrace_trapping_sleep_fn(void *flags) +{ + schedule(); + return 0; +} + /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. @@ -77,13 +84,20 @@ void __ptrace_unlink(struct task_struct *child) spin_lock(&child->sighand->siglock); /* - * Reinstate GROUP_STOP_PENDING if group stop is in effect and + * Clear all pending traps and TRAPPING. TRAPPING should be + * cleared regardless of JOBCTL_STOP_PENDING. Do it explicitly. + */ + task_clear_jobctl_pending(child, JOBCTL_TRAP_MASK); + task_clear_jobctl_trapping(child); + + /* + * Reinstate JOBCTL_STOP_PENDING if group stop is in effect and * @child isn't dead. */ if (!(child->flags & PF_EXITING) && (child->signal->flags & SIGNAL_STOP_STOPPED || child->signal->group_stop_count)) - child->group_stop |= GROUP_STOP_PENDING; + child->jobctl |= JOBCTL_STOP_PENDING; /* * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick @@ -91,16 +105,30 @@ void __ptrace_unlink(struct task_struct *child) * is in TASK_TRACED; otherwise, we might unduly disrupt * TASK_KILLABLE sleeps. */ - if (child->group_stop & GROUP_STOP_PENDING || task_is_traced(child)) + if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child)) signal_wake_up(child, task_is_traced(child)); spin_unlock(&child->sighand->siglock); } -/* - * Check that we have indeed attached to the thing.. +/** + * ptrace_check_attach - check whether ptracee is ready for ptrace operation + * @child: ptracee to check for + * @ignore_state: don't check whether @child is currently %TASK_TRACED + * + * Check whether @child is being ptraced by %current and ready for further + * ptrace operations. If @ignore_state is %false, @child also should be in + * %TASK_TRACED state and on return the child is guaranteed to be traced + * and not executing. If @ignore_state is %true, @child can be in any + * state. + * + * CONTEXT: + * Grabs and releases tasklist_lock and @child->sighand->siglock. + * + * RETURNS: + * 0 on success, -ESRCH if %child is not ready. */ -int ptrace_check_attach(struct task_struct *child, int kill) +int ptrace_check_attach(struct task_struct *child, bool ignore_state) { int ret = -ESRCH; @@ -119,13 +147,14 @@ int ptrace_check_attach(struct task_struct *child, int kill) */ spin_lock_irq(&child->sighand->siglock); WARN_ON_ONCE(task_is_stopped(child)); - if (task_is_traced(child) || kill) + if (ignore_state || (task_is_traced(child) && + !(child->jobctl & JOBCTL_LISTENING))) ret = 0; spin_unlock_irq(&child->sighand->siglock); } read_unlock(&tasklist_lock); - if (!ret && !kill) + if (!ret && !ignore_state) ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; /* All systems go.. */ @@ -182,11 +211,28 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) return !err; } -static int ptrace_attach(struct task_struct *task) +static int ptrace_attach(struct task_struct *task, long request, + unsigned long flags) { - bool wait_trap = false; + bool seize = (request == PTRACE_SEIZE); int retval; + /* + * SEIZE will enable new ptrace behaviors which will be implemented + * gradually. SEIZE_DEVEL is used to prevent applications + * expecting full SEIZE behaviors trapping on kernel commits which + * are still in the process of implementing them. + * + * Only test programs for new ptrace behaviors being implemented + * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO. + * + * Once SEIZE behaviors are completely implemented, this flag and + * the following test will be removed. + */ + retval = -EIO; + if (seize && !(flags & PTRACE_SEIZE_DEVEL)) + goto out; + audit_ptrace(task); retval = -EPERM; @@ -218,16 +264,21 @@ static int ptrace_attach(struct task_struct *task) goto unlock_tasklist; task->ptrace = PT_PTRACED; + if (seize) + task->ptrace |= PT_SEIZED; if (task_ns_capable(task, CAP_SYS_PTRACE)) task->ptrace |= PT_PTRACE_CAP; __ptrace_link(task, current); - send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); + + /* SEIZE doesn't trap tracee on attach */ + if (!seize) + send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); spin_lock(&task->sighand->siglock); /* - * If the task is already STOPPED, set GROUP_STOP_PENDING and + * If the task is already STOPPED, set JOBCTL_TRAP_STOP and * TRAPPING, and kick it so that it transits to TRACED. TRAPPING * will be cleared if the child completes the transition or any * event which clears the group stop states happens. We'll wait @@ -243,11 +294,9 @@ static int ptrace_attach(struct task_struct *task) * The following task_is_stopped() test is safe as both transitions * in and out of STOPPED are protected by siglock. */ - if (task_is_stopped(task)) { - task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING; + if (task_is_stopped(task) && + task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) signal_wake_up(task, 1); - wait_trap = true; - } spin_unlock(&task->sighand->siglock); @@ -257,9 +306,12 @@ unlock_tasklist: unlock_creds: mutex_unlock(&task->signal->cred_guard_mutex); out: - if (wait_trap) - wait_event(current->signal->wait_chldexit, - !(task->group_stop & GROUP_STOP_TRAPPING)); + if (!retval) { + wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, + ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE); + proc_ptrace_connector(task, PTRACE_ATTACH); + } + return retval; } @@ -322,25 +374,27 @@ static int ignoring_children(struct sighand_struct *sigh) */ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) { + bool dead; + __ptrace_unlink(p); - if (p->exit_state == EXIT_ZOMBIE) { - if (!task_detached(p) && thread_group_empty(p)) { - if (!same_thread_group(p->real_parent, tracer)) - do_notify_parent(p, p->exit_signal); - else if (ignoring_children(tracer->sighand)) { - __wake_up_parent(p, tracer); - p->exit_signal = -1; - } - } - if (task_detached(p)) { - /* Mark it as in the process of being reaped. */ - p->exit_state = EXIT_DEAD; - return true; + if (p->exit_state != EXIT_ZOMBIE) + return false; + + dead = !thread_group_leader(p); + + if (!dead && thread_group_empty(p)) { + if (!same_thread_group(p->real_parent, tracer)) + dead = do_notify_parent(p, p->exit_signal); + else if (ignoring_children(tracer->sighand)) { + __wake_up_parent(p, tracer); + dead = true; } } - - return false; + /* Mark it as in the process of being reaped. */ + if (dead) + p->exit_state = EXIT_DEAD; + return dead; } static int ptrace_detach(struct task_struct *child, unsigned int data) @@ -365,6 +419,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) } write_unlock_irq(&tasklist_lock); + proc_ptrace_connector(child, PTRACE_DETACH); if (unlikely(dead)) release_task(child); @@ -611,10 +666,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type, int ptrace_request(struct task_struct *child, long request, unsigned long addr, unsigned long data) { + bool seized = child->ptrace & PT_SEIZED; int ret = -EIO; - siginfo_t siginfo; + siginfo_t siginfo, *si; void __user *datavp = (void __user *) data; unsigned long __user *datalp = datavp; + unsigned long flags; switch (request) { case PTRACE_PEEKTEXT: @@ -647,6 +704,62 @@ int ptrace_request(struct task_struct *child, long request, ret = ptrace_setsiginfo(child, &siginfo); break; + case PTRACE_INTERRUPT: + /* + * Stop tracee without any side-effect on signal or job + * control. At least one trap is guaranteed to happen + * after this request. If @child is already trapped, the + * current trap is not disturbed and another trap will + * happen after the current trap is ended with PTRACE_CONT. + * + * The actual trap might not be PTRACE_EVENT_STOP trap but + * the pending condition is cleared regardless. + */ + if (unlikely(!seized || !lock_task_sighand(child, &flags))) + break; + + /* + * INTERRUPT doesn't disturb existing trap sans one + * exception. If ptracer issued LISTEN for the current + * STOP, this INTERRUPT should clear LISTEN and re-trap + * tracee into STOP. + */ + if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP))) + signal_wake_up(child, child->jobctl & JOBCTL_LISTENING); + + unlock_task_sighand(child, &flags); + ret = 0; + break; + + case PTRACE_LISTEN: + /* + * Listen for events. Tracee must be in STOP. It's not + * resumed per-se but is not considered to be in TRACED by + * wait(2) or ptrace(2). If an async event (e.g. group + * stop state change) happens, tracee will enter STOP trap + * again. Alternatively, ptracer can issue INTERRUPT to + * finish listening and re-trap tracee into STOP. + */ + if (unlikely(!seized || !lock_task_sighand(child, &flags))) + break; + + si = child->last_siginfo; + if (unlikely(!si || si->si_code >> 8 != PTRACE_EVENT_STOP)) + break; + + child->jobctl |= JOBCTL_LISTENING; + + /* + * If NOTIFY is set, it means event happened between start + * of this trap and now. Trigger re-trap immediately. + */ + if (child->jobctl & JOBCTL_TRAP_NOTIFY) + signal_wake_up(child, true); + + unlock_task_sighand(child, &flags); + ret = 0; + break; + case PTRACE_DETACH: /* detach a process that was attached. */ ret = ptrace_detach(child, data); break; @@ -761,8 +874,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, goto out; } - if (request == PTRACE_ATTACH) { - ret = ptrace_attach(child); + if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { + ret = ptrace_attach(child, request, data); /* * Some architectures need to do book-keeping after * a ptrace attach. @@ -772,7 +885,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, goto out_put_task_struct; } - ret = ptrace_check_attach(child, request == PTRACE_KILL); + ret = ptrace_check_attach(child, request == PTRACE_KILL || + request == PTRACE_INTERRUPT); if (ret < 0) goto out_put_task_struct; @@ -903,8 +1017,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, goto out; } - if (request == PTRACE_ATTACH) { - ret = ptrace_attach(child); + if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { + ret = ptrace_attach(child, request, data); /* * Some architectures need to do book-keeping after * a ptrace attach. @@ -914,7 +1028,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, goto out_put_task_struct; } - ret = ptrace_check_attach(child, request == PTRACE_KILL); + ret = ptrace_check_attach(child, request == PTRACE_KILL || + request == PTRACE_INTERRUPT); if (!ret) ret = compat_arch_ptrace(child, request, addr, data); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 7784bd216b6a..ddddb320be61 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -37,7 +37,7 @@ #include <linux/smp.h> #include <linux/interrupt.h> #include <linux/sched.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/bitops.h> #include <linux/percpu.h> #include <linux/notifier.h> diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 2e138db03382..98f51b13bb7e 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -33,7 +33,7 @@ #include <linux/rcupdate.h> #include <linux/interrupt.h> #include <linux/sched.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/bitops.h> #include <linux/completion.h> #include <linux/moduleparam.h> @@ -941,7 +941,6 @@ static void rcu_torture_timer(unsigned long unused) idx = cur_ops->readlock(); completed = cur_ops->completed(); p = rcu_dereference_check(rcu_torture_current, - rcu_read_lock_held() || rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || srcu_read_lock_held(&srcu_ctl)); @@ -1002,7 +1001,6 @@ rcu_torture_reader(void *arg) idx = cur_ops->readlock(); completed = cur_ops->completed(); p = rcu_dereference_check(rcu_torture_current, - rcu_read_lock_held() || rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || srcu_read_lock_held(&srcu_ctl)); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 7e59ffb3d0ba..ba06207b1dd3 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -84,9 +84,32 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); static struct rcu_state *rcu_state; +/* + * The rcu_scheduler_active variable transitions from zero to one just + * before the first task is spawned. So when this variable is zero, RCU + * can assume that there is but one task, allowing RCU to (for example) + * optimized synchronize_sched() to a simple barrier(). When this variable + * is one, RCU must actually do all the hard work required to detect real + * grace periods. This variable is also used to suppress boot-time false + * positives from lockdep-RCU error checking. + */ int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(rcu_scheduler_active); +/* + * The rcu_scheduler_fully_active variable transitions from zero to one + * during the early_initcall() processing, which is after the scheduler + * is capable of creating new tasks. So RCU processing (for example, + * creating tasks for RCU priority boosting) must be delayed until after + * rcu_scheduler_fully_active transitions from zero to one. We also + * currently delay invocation of any RCU callbacks until after this point. + * + * It might later prove better for people registering RCU callbacks during + * early boot to take responsibility for these callbacks, but one step at + * a time. + */ +static int rcu_scheduler_fully_active __read_mostly; + #ifdef CONFIG_RCU_BOOST /* @@ -98,7 +121,6 @@ DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu); DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); DEFINE_PER_CPU(char, rcu_cpu_has_work); -static char rcu_kthreads_spawnable; #endif /* #ifdef CONFIG_RCU_BOOST */ @@ -1467,6 +1489,8 @@ static void rcu_process_callbacks(struct softirq_action *unused) */ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) { + if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active))) + return; if (likely(!rsp->boost)) { rcu_do_batch(rsp, rdp); return; diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 14dc7dd00902..8aafbb80b8b0 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -68,6 +68,7 @@ struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); static struct rcu_state *rcu_state = &rcu_preempt_state; +static void rcu_read_unlock_special(struct task_struct *t); static int rcu_preempted_readers_exp(struct rcu_node *rnp); /* @@ -147,7 +148,7 @@ static void rcu_preempt_note_context_switch(int cpu) struct rcu_data *rdp; struct rcu_node *rnp; - if (t->rcu_read_lock_nesting && + if (t->rcu_read_lock_nesting > 0 && (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { /* Possibly blocking in an RCU read-side critical section. */ @@ -190,6 +191,14 @@ static void rcu_preempt_note_context_switch(int cpu) rnp->gp_tasks = &t->rcu_node_entry; } raw_spin_unlock_irqrestore(&rnp->lock, flags); + } else if (t->rcu_read_lock_nesting < 0 && + t->rcu_read_unlock_special) { + + /* + * Complete exit from RCU read-side critical section on + * behalf of preempted instance of __rcu_read_unlock(). + */ + rcu_read_unlock_special(t); } /* @@ -284,7 +293,7 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t, * notify RCU core processing or task having blocked during the RCU * read-side critical section. */ -static void rcu_read_unlock_special(struct task_struct *t) +static noinline void rcu_read_unlock_special(struct task_struct *t) { int empty; int empty_exp; @@ -309,7 +318,7 @@ static void rcu_read_unlock_special(struct task_struct *t) } /* Hardware IRQ handlers cannot block. */ - if (in_irq()) { + if (in_irq() || in_serving_softirq()) { local_irq_restore(flags); return; } @@ -342,6 +351,11 @@ static void rcu_read_unlock_special(struct task_struct *t) #ifdef CONFIG_RCU_BOOST if (&t->rcu_node_entry == rnp->boost_tasks) rnp->boost_tasks = np; + /* Snapshot and clear ->rcu_boosted with rcu_node lock held. */ + if (t->rcu_boosted) { + special |= RCU_READ_UNLOCK_BOOSTED; + t->rcu_boosted = 0; + } #endif /* #ifdef CONFIG_RCU_BOOST */ t->rcu_blocked_node = NULL; @@ -358,7 +372,6 @@ static void rcu_read_unlock_special(struct task_struct *t) #ifdef CONFIG_RCU_BOOST /* Unboost if we were boosted. */ if (special & RCU_READ_UNLOCK_BOOSTED) { - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED; rt_mutex_unlock(t->rcu_boost_mutex); t->rcu_boost_mutex = NULL; } @@ -387,13 +400,22 @@ void __rcu_read_unlock(void) struct task_struct *t = current; barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ - --t->rcu_read_lock_nesting; - barrier(); /* decrement before load of ->rcu_read_unlock_special */ - if (t->rcu_read_lock_nesting == 0 && - unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) - rcu_read_unlock_special(t); + if (t->rcu_read_lock_nesting != 1) + --t->rcu_read_lock_nesting; + else { + t->rcu_read_lock_nesting = INT_MIN; + barrier(); /* assign before ->rcu_read_unlock_special load */ + if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) + rcu_read_unlock_special(t); + barrier(); /* ->rcu_read_unlock_special load before assign */ + t->rcu_read_lock_nesting = 0; + } #ifdef CONFIG_PROVE_LOCKING - WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0); + { + int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); + + WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); + } #endif /* #ifdef CONFIG_PROVE_LOCKING */ } EXPORT_SYMBOL_GPL(__rcu_read_unlock); @@ -589,7 +611,8 @@ static void rcu_preempt_check_callbacks(int cpu) rcu_preempt_qs(cpu); return; } - if (per_cpu(rcu_preempt_data, cpu).qs_pending) + if (t->rcu_read_lock_nesting > 0 && + per_cpu(rcu_preempt_data, cpu).qs_pending) t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; } @@ -695,9 +718,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) raw_spin_lock_irqsave(&rnp->lock, flags); for (;;) { - if (!sync_rcu_preempt_exp_done(rnp)) + if (!sync_rcu_preempt_exp_done(rnp)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); break; + } if (rnp->parent == NULL) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); wake_up(&sync_rcu_preempt_exp_wq); break; } @@ -707,7 +733,6 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) raw_spin_lock(&rnp->lock); /* irqs already disabled */ rnp->expmask &= ~mask; } - raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -1174,7 +1199,7 @@ static int rcu_boost(struct rcu_node *rnp) t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&mtx, t); t->rcu_boost_mutex = &mtx; - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; + t->rcu_boosted = 1; raw_spin_unlock_irqrestore(&rnp->lock, flags); rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ @@ -1532,7 +1557,7 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) struct sched_param sp; struct task_struct *t; - if (!rcu_kthreads_spawnable || + if (!rcu_scheduler_fully_active || per_cpu(rcu_cpu_kthread_task, cpu) != NULL) return 0; t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu); @@ -1639,7 +1664,7 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, struct sched_param sp; struct task_struct *t; - if (!rcu_kthreads_spawnable || + if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0) return 0; if (rnp->node_kthread_task == NULL) { @@ -1665,7 +1690,7 @@ static int __init rcu_spawn_kthreads(void) int cpu; struct rcu_node *rnp; - rcu_kthreads_spawnable = 1; + rcu_scheduler_fully_active = 1; for_each_possible_cpu(cpu) { per_cpu(rcu_cpu_has_work, cpu) = 0; if (cpu_online(cpu)) @@ -1687,7 +1712,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) struct rcu_node *rnp = rdp->mynode; /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ - if (rcu_kthreads_spawnable) { + if (rcu_scheduler_fully_active) { (void)rcu_spawn_one_cpu_kthread(cpu); if (rnp->node_kthread_task == NULL) (void)rcu_spawn_one_node_kthread(rcu_state, rnp); @@ -1726,6 +1751,13 @@ static void rcu_cpu_kthread_setrt(int cpu, int to_rt) { } +static int __init rcu_scheduler_really_started(void) +{ + rcu_scheduler_fully_active = 1; + return 0; +} +early_initcall(rcu_scheduler_really_started); + static void __cpuinit rcu_prepare_kthreads(int cpu) { } diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 4e144876dc68..3b0c0986afc0 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -31,7 +31,7 @@ #include <linux/rcupdate.h> #include <linux/interrupt.h> #include <linux/sched.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/bitops.h> #include <linux/module.h> #include <linux/completion.h> diff --git a/kernel/resource.c b/kernel/resource.c index 798e2fae2a06..3b3cedc52592 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -38,6 +38,14 @@ struct resource iomem_resource = { }; EXPORT_SYMBOL(iomem_resource); +/* constraints to be met while allocating resources */ +struct resource_constraint { + resource_size_t min, max, align; + resource_size_t (*alignf)(void *, const struct resource *, + resource_size_t, resource_size_t); + void *alignf_data; +}; + static DEFINE_RWLOCK(resource_lock); static void *r_next(struct seq_file *m, void *v, loff_t *pos) @@ -384,16 +392,13 @@ static bool resource_contains(struct resource *res1, struct resource *res2) } /* - * Find empty slot in the resource tree given range and alignment. + * Find empty slot in the resource tree with the given range and + * alignment constraints */ -static int find_resource(struct resource *root, struct resource *new, - resource_size_t size, resource_size_t min, - resource_size_t max, resource_size_t align, - resource_size_t (*alignf)(void *, - const struct resource *, - resource_size_t, - resource_size_t), - void *alignf_data) +static int __find_resource(struct resource *root, struct resource *old, + struct resource *new, + resource_size_t size, + struct resource_constraint *constraint) { struct resource *this = root->child; struct resource tmp = *new, avail, alloc; @@ -404,25 +409,26 @@ static int find_resource(struct resource *root, struct resource *new, * Skip past an allocated resource that starts at 0, since the assignment * of this->start - 1 to tmp->end below would cause an underflow. */ - if (this && this->start == 0) { - tmp.start = this->end + 1; + if (this && this->start == root->start) { + tmp.start = (this == old) ? old->start : this->end + 1; this = this->sibling; } for(;;) { if (this) - tmp.end = this->start - 1; + tmp.end = (this == old) ? this->end : this->start - 1; else tmp.end = root->end; - resource_clip(&tmp, min, max); + resource_clip(&tmp, constraint->min, constraint->max); arch_remove_reservations(&tmp); /* Check for overflow after ALIGN() */ avail = *new; - avail.start = ALIGN(tmp.start, align); + avail.start = ALIGN(tmp.start, constraint->align); avail.end = tmp.end; if (avail.start >= tmp.start) { - alloc.start = alignf(alignf_data, &avail, size, align); + alloc.start = constraint->alignf(constraint->alignf_data, &avail, + size, constraint->align); alloc.end = alloc.start + size - 1; if (resource_contains(&avail, &alloc)) { new->start = alloc.start; @@ -432,14 +438,75 @@ static int find_resource(struct resource *root, struct resource *new, } if (!this) break; - tmp.start = this->end + 1; + if (this != old) + tmp.start = this->end + 1; this = this->sibling; } return -EBUSY; } +/* + * Find empty slot in the resource tree given range and alignment. + */ +static int find_resource(struct resource *root, struct resource *new, + resource_size_t size, + struct resource_constraint *constraint) +{ + return __find_resource(root, NULL, new, size, constraint); +} + /** - * allocate_resource - allocate empty slot in the resource tree given range & alignment + * reallocate_resource - allocate a slot in the resource tree given range & alignment. + * The resource will be relocated if the new size cannot be reallocated in the + * current location. + * + * @root: root resource descriptor + * @old: resource descriptor desired by caller + * @newsize: new size of the resource descriptor + * @constraint: the size and alignment constraints to be met. + */ +int reallocate_resource(struct resource *root, struct resource *old, + resource_size_t newsize, + struct resource_constraint *constraint) +{ + int err=0; + struct resource new = *old; + struct resource *conflict; + + write_lock(&resource_lock); + + if ((err = __find_resource(root, old, &new, newsize, constraint))) + goto out; + + if (resource_contains(&new, old)) { + old->start = new.start; + old->end = new.end; + goto out; + } + + if (old->child) { + err = -EBUSY; + goto out; + } + + if (resource_contains(old, &new)) { + old->start = new.start; + old->end = new.end; + } else { + __release_resource(old); + *old = new; + conflict = __request_resource(root, old); + BUG_ON(conflict); + } +out: + write_unlock(&resource_lock); + return err; +} + + +/** + * allocate_resource - allocate empty slot in the resource tree given range & alignment. + * The resource will be reallocated with a new size if it was already allocated * @root: root resource descriptor * @new: resource descriptor desired by caller * @size: requested resource region size @@ -459,12 +526,25 @@ int allocate_resource(struct resource *root, struct resource *new, void *alignf_data) { int err; + struct resource_constraint constraint; if (!alignf) alignf = simple_align_resource; + constraint.min = min; + constraint.max = max; + constraint.align = align; + constraint.alignf = alignf; + constraint.alignf_data = alignf_data; + + if ( new->parent ) { + /* resource is already allocated, try reallocating with + the new constraints */ + return reallocate_resource(root, new, size, &constraint); + } + write_lock(&resource_lock); - err = find_resource(root, new, size, min, max, align, alignf, alignf_data); + err = find_resource(root, new, size, &constraint); if (err >= 0 && __request_resource(root, new)) err = -EBUSY; write_unlock(&resource_lock); @@ -473,6 +553,27 @@ int allocate_resource(struct resource *root, struct resource *new, EXPORT_SYMBOL(allocate_resource); +/** + * lookup_resource - find an existing resource by a resource start address + * @root: root resource descriptor + * @start: resource start address + * + * Returns a pointer to the resource if found, NULL otherwise + */ +struct resource *lookup_resource(struct resource *root, resource_size_t start) +{ + struct resource *res; + + read_lock(&resource_lock); + for (res = root->child; res; res = res->sibling) { + if (res->start == start) + break; + } + read_unlock(&resource_lock); + + return res; +} + /* * Insert a resource into the resource tree. If successful, return NULL, * otherwise return the conflicting resource (compare to __request_resource()) diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index ab449117aaf2..255e1662acdb 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -890,7 +890,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name) { lock->owner = NULL; raw_spin_lock_init(&lock->wait_lock); - plist_head_init_raw(&lock->wait_list, &lock->wait_lock); + plist_head_init(&lock->wait_list); debug_rt_mutex_init(lock, name); } diff --git a/kernel/rwsem.c b/kernel/rwsem.c index cae050b05f5e..9f48f3d82e9b 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c @@ -11,7 +11,7 @@ #include <linux/rwsem.h> #include <asm/system.h> -#include <asm/atomic.h> +#include <linux/atomic.h> /* * lock for reading @@ -117,15 +117,6 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_read_nested); -void down_read_non_owner(struct rw_semaphore *sem) -{ - might_sleep(); - - __down_read(sem); -} - -EXPORT_SYMBOL(down_read_non_owner); - void down_write_nested(struct rw_semaphore *sem, int subclass) { might_sleep(); @@ -136,13 +127,6 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_write_nested); -void up_read_non_owner(struct rw_semaphore *sem) -{ - __up_read(sem); -} - -EXPORT_SYMBOL(up_read_non_owner); - #endif diff --git a/kernel/sched.c b/kernel/sched.c index d08d110b8976..ccacdbdecf45 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -75,6 +75,9 @@ #include <asm/tlb.h> #include <asm/irq_regs.h> #include <asm/mutex.h> +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#endif #include "sched_cpupri.h" #include "workqueue_sched.h" @@ -124,7 +127,7 @@ static inline int rt_policy(int policy) { - if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) + if (policy == SCHED_FIFO || policy == SCHED_RR) return 1; return 0; } @@ -292,8 +295,8 @@ static DEFINE_SPINLOCK(task_group_lock); * (The default weight is 1024 - so there's no practical * limitation from this.) */ -#define MIN_SHARES 2 -#define MAX_SHARES (1UL << (18 + SCHED_LOAD_RESOLUTION)) +#define MIN_SHARES (1UL << 1) +#define MAX_SHARES (1UL << 18) static int root_task_group_load = ROOT_TASK_GROUP_LOAD; #endif @@ -422,6 +425,7 @@ struct rt_rq { */ struct root_domain { atomic_t refcount; + atomic_t rto_count; struct rcu_head rcu; cpumask_var_t span; cpumask_var_t online; @@ -431,7 +435,6 @@ struct root_domain { * one runnable RT task. */ cpumask_var_t rto_mask; - atomic_t rto_count; struct cpupri cpupri; }; @@ -528,6 +531,12 @@ struct rq { #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; #endif +#ifdef CONFIG_PARAVIRT + u64 prev_steal_time; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + u64 prev_steal_time_rq; +#endif /* calc_load related fields */ unsigned long calc_load_update; @@ -581,7 +590,6 @@ static inline int cpu_of(struct rq *rq) #define rcu_dereference_check_sched_domain(p) \ rcu_dereference_check((p), \ - rcu_read_lock_held() || \ lockdep_is_held(&sched_domains_mutex)) /* @@ -1568,38 +1576,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) return rq->avg_load_per_task; } -#ifdef CONFIG_FAIR_GROUP_SCHED - -/* - * Compute the cpu's hierarchical load factor for each task group. - * This needs to be done in a top-down fashion because the load of a child - * group is a fraction of its parents load. - */ -static int tg_load_down(struct task_group *tg, void *data) -{ - unsigned long load; - long cpu = (long)data; - - if (!tg->parent) { - load = cpu_rq(cpu)->load.weight; - } else { - load = tg->parent->cfs_rq[cpu]->h_load; - load *= tg->se[cpu]->load.weight; - load /= tg->parent->cfs_rq[cpu]->load.weight + 1; - } - - tg->cfs_rq[cpu]->h_load = load; - - return 0; -} - -static void update_h_load(long cpu) -{ - walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); -} - -#endif - #ifdef CONFIG_PREEMPT static void double_rq_lock(struct rq *rq1, struct rq *rq2); @@ -1953,10 +1929,28 @@ void account_system_vtime(struct task_struct *curr) } EXPORT_SYMBOL_GPL(account_system_vtime); -static void update_rq_clock_task(struct rq *rq, s64 delta) +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#ifdef CONFIG_PARAVIRT +static inline u64 steal_ticks(u64 steal) { - s64 irq_delta; + if (unlikely(steal > NSEC_PER_SEC)) + return div_u64(steal, TICK_NSEC); + + return __iter_div_u64_rem(steal, TICK_NSEC, &steal); +} +#endif +static void update_rq_clock_task(struct rq *rq, s64 delta) +{ +/* + * In theory, the compile should just see 0 here, and optimize out the call + * to sched_rt_avg_update. But I don't trust it... + */ +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + s64 steal = 0, irq_delta = 0; +#endif +#ifdef CONFIG_IRQ_TIME_ACCOUNTING irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; /* @@ -1979,12 +1973,35 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->prev_irq_time += irq_delta; delta -= irq_delta; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + if (static_branch((¶virt_steal_rq_enabled))) { + u64 st; + + steal = paravirt_steal_clock(cpu_of(rq)); + steal -= rq->prev_steal_time_rq; + + if (unlikely(steal > delta)) + steal = delta; + + st = steal_ticks(steal); + steal = st * TICK_NSEC; + + rq->prev_steal_time_rq += steal; + + delta -= steal; + } +#endif + rq->clock_task += delta; - if (irq_delta && sched_feat(NONIRQ_POWER)) - sched_rt_avg_update(rq, irq_delta); +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) + sched_rt_avg_update(rq, irq_delta + steal); +#endif } +#ifdef CONFIG_IRQ_TIME_ACCOUNTING static int irqtime_account_hi_update(void) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; @@ -2019,12 +2036,7 @@ static int irqtime_account_si_update(void) #define sched_clock_irqtime (0) -static void update_rq_clock_task(struct rq *rq, s64 delta) -{ - rq->clock_task += delta; -} - -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ +#endif #include "sched_idletask.c" #include "sched_fair.c" @@ -2497,7 +2509,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) if (p->sched_class->task_woken) p->sched_class->task_woken(rq, p); - if (unlikely(rq->idle_stamp)) { + if (rq->idle_stamp) { u64 delta = rq->clock - rq->idle_stamp; u64 max = 2*sysctl_sched_migration_cost; @@ -2544,13 +2556,9 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) } #ifdef CONFIG_SMP -static void sched_ttwu_pending(void) +static void sched_ttwu_do_pending(struct task_struct *list) { struct rq *rq = this_rq(); - struct task_struct *list = xchg(&rq->wake_list, NULL); - - if (!list) - return; raw_spin_lock(&rq->lock); @@ -2563,9 +2571,45 @@ static void sched_ttwu_pending(void) raw_spin_unlock(&rq->lock); } +#ifdef CONFIG_HOTPLUG_CPU + +static void sched_ttwu_pending(void) +{ + struct rq *rq = this_rq(); + struct task_struct *list = xchg(&rq->wake_list, NULL); + + if (!list) + return; + + sched_ttwu_do_pending(list); +} + +#endif /* CONFIG_HOTPLUG_CPU */ + void scheduler_ipi(void) { - sched_ttwu_pending(); + struct rq *rq = this_rq(); + struct task_struct *list = xchg(&rq->wake_list, NULL); + + if (!list) + return; + + /* + * Not all reschedule IPI handlers call irq_enter/irq_exit, since + * traditionally all their work was done from the interrupt return + * path. Now that we actually do some work, we need to make sure + * we do call them. + * + * Some archs already do call them, luckily irq_enter/exit nest + * properly. + * + * Arguably we should visit all archs and update all handlers, + * however a fair share of IPIs are still resched only so this would + * somewhat pessimize the simple resched case. + */ + irq_enter(); + sched_ttwu_do_pending(list); + irq_exit(); } static void ttwu_queue_remote(struct task_struct *p, int cpu) @@ -2854,7 +2898,7 @@ void sched_fork(struct task_struct *p) #if defined(CONFIG_SMP) p->on_cpu = 0; #endif -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPT_COUNT /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif @@ -3845,6 +3889,25 @@ void account_idle_time(cputime_t cputime) cpustat->idle = cputime64_add(cpustat->idle, cputime64); } +static __always_inline bool steal_account_process_tick(void) +{ +#ifdef CONFIG_PARAVIRT + if (static_branch(¶virt_steal_enabled)) { + u64 steal, st = 0; + + steal = paravirt_steal_clock(smp_processor_id()); + steal -= this_rq()->prev_steal_time; + + st = steal_ticks(steal); + this_rq()->prev_steal_time += st * TICK_NSEC; + + account_steal_time(st); + return st; + } +#endif + return false; +} + #ifndef CONFIG_VIRT_CPU_ACCOUNTING #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -3876,6 +3939,9 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + if (steal_account_process_tick()) + return; + if (irqtime_account_hi_update()) { cpustat->irq = cputime64_add(cpustat->irq, tmp); } else if (irqtime_account_si_update()) { @@ -3929,6 +3995,9 @@ void account_process_tick(struct task_struct *p, int user_tick) return; } + if (steal_account_process_tick()) + return; + if (user_tick) account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) @@ -4306,11 +4375,8 @@ EXPORT_SYMBOL(schedule); static inline bool owner_running(struct mutex *lock, struct task_struct *owner) { - bool ret = false; - - rcu_read_lock(); if (lock->owner != owner) - goto fail; + return false; /* * Ensure we emit the owner->on_cpu, dereference _after_ checking @@ -4320,11 +4386,7 @@ static inline bool owner_running(struct mutex *lock, struct task_struct *owner) */ barrier(); - ret = owner->on_cpu; -fail: - rcu_read_unlock(); - - return ret; + return owner->on_cpu; } /* @@ -4336,21 +4398,21 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) if (!sched_feat(OWNER_SPIN)) return 0; + rcu_read_lock(); while (owner_running(lock, owner)) { if (need_resched()) - return 0; + break; arch_mutex_cpu_relax(); } + rcu_read_unlock(); /* - * If the owner changed to another task there is likely - * heavy contention, stop spinning. + * We break out the loop above on need_resched() and when the + * owner changed, which is a sign for heavy contention. Return + * success only when lock->owner is NULL. */ - if (lock->owner) - return 0; - - return 1; + return lock->owner == NULL; } #endif @@ -6557,7 +6619,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!group->cpu_power) { + if (!group->sgp->power) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: domain->cpu_power not " "set\n"); @@ -6581,9 +6643,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); printk(KERN_CONT " %s", str); - if (group->cpu_power != SCHED_POWER_SCALE) { + if (group->sgp->power != SCHED_POWER_SCALE) { printk(KERN_CONT " (cpu_power = %d)", - group->cpu_power); + group->sgp->power); } group = group->next; @@ -6774,11 +6836,39 @@ static struct root_domain *alloc_rootdomain(void) return rd; } +static void free_sched_groups(struct sched_group *sg, int free_sgp) +{ + struct sched_group *tmp, *first; + + if (!sg) + return; + + first = sg; + do { + tmp = sg->next; + + if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) + kfree(sg->sgp); + + kfree(sg); + sg = tmp; + } while (sg != first); +} + static void free_sched_domain(struct rcu_head *rcu) { struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); - if (atomic_dec_and_test(&sd->groups->ref)) + + /* + * If its an overlapping domain it has private groups, iterate and + * nuke them all. + */ + if (sd->flags & SD_OVERLAP) { + free_sched_groups(sd->groups, 1); + } else if (atomic_dec_and_test(&sd->groups->ref)) { + kfree(sd->groups->sgp); kfree(sd->groups); + } kfree(sd); } @@ -6945,6 +7035,7 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; struct sd_data { struct sched_domain **__percpu sd; struct sched_group **__percpu sg; + struct sched_group_power **__percpu sgp; }; struct s_data { @@ -6964,15 +7055,73 @@ struct sched_domain_topology_level; typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); +#define SDTL_OVERLAP 0x01 + struct sched_domain_topology_level { sched_domain_init_f init; sched_domain_mask_f mask; + int flags; struct sd_data data; }; -/* - * Assumes the sched_domain tree is fully constructed - */ +static int +build_overlap_sched_groups(struct sched_domain *sd, int cpu) +{ + struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; + const struct cpumask *span = sched_domain_span(sd); + struct cpumask *covered = sched_domains_tmpmask; + struct sd_data *sdd = sd->private; + struct sched_domain *child; + int i; + + cpumask_clear(covered); + + for_each_cpu(i, span) { + struct cpumask *sg_span; + + if (cpumask_test_cpu(i, covered)) + continue; + + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, cpu_to_node(i)); + + if (!sg) + goto fail; + + sg_span = sched_group_cpus(sg); + + child = *per_cpu_ptr(sdd->sd, i); + if (child->child) { + child = child->child; + cpumask_copy(sg_span, sched_domain_span(child)); + } else + cpumask_set_cpu(i, sg_span); + + cpumask_or(covered, covered, sg_span); + + sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); + atomic_inc(&sg->sgp->ref); + + if (cpumask_test_cpu(cpu, sg_span)) + groups = sg; + + if (!first) + first = sg; + if (last) + last->next = sg; + last = sg; + last->next = first; + } + sd->groups = groups; + + return 0; + +fail: + free_sched_groups(first, 0); + + return -ENOMEM; +} + static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) { struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); @@ -6981,24 +7130,24 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) if (child) cpu = cpumask_first(sched_domain_span(child)); - if (sg) + if (sg) { *sg = *per_cpu_ptr(sdd->sg, cpu); + (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); + atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ + } return cpu; } /* - * build_sched_groups takes the cpumask we wish to span, and a pointer - * to a function which identifies what group(along with sched group) a CPU - * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids - * (due to the fact that we keep track of groups covered with a struct cpumask). - * * build_sched_groups will build a circular linked list of the groups * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. + * + * Assumes the sched_domain tree is fully constructed */ -static void -build_sched_groups(struct sched_domain *sd) +static int +build_sched_groups(struct sched_domain *sd, int cpu) { struct sched_group *first = NULL, *last = NULL; struct sd_data *sdd = sd->private; @@ -7006,6 +7155,12 @@ build_sched_groups(struct sched_domain *sd) struct cpumask *covered; int i; + get_group(cpu, sdd, &sd->groups); + atomic_inc(&sd->groups->ref); + + if (cpu != cpumask_first(sched_domain_span(sd))) + return 0; + lockdep_assert_held(&sched_domains_mutex); covered = sched_domains_tmpmask; @@ -7020,7 +7175,7 @@ build_sched_groups(struct sched_domain *sd) continue; cpumask_clear(sched_group_cpus(sg)); - sg->cpu_power = 0; + sg->sgp->power = 0; for_each_cpu(j, span) { if (get_group(j, sdd, NULL) != group) @@ -7037,6 +7192,8 @@ build_sched_groups(struct sched_domain *sd) last = sg; } last->next = first; + + return 0; } /* @@ -7051,12 +7208,17 @@ build_sched_groups(struct sched_domain *sd) */ static void init_sched_groups_power(int cpu, struct sched_domain *sd) { - WARN_ON(!sd || !sd->groups); + struct sched_group *sg = sd->groups; - if (cpu != group_first_cpu(sd->groups)) - return; + WARN_ON(!sd || !sg); - sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); + do { + sg->group_weight = cpumask_weight(sched_group_cpus(sg)); + sg = sg->next; + } while (sg != sd->groups); + + if (cpu != group_first_cpu(sg)) + return; update_group_power(sd, cpu); } @@ -7177,15 +7339,15 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, static void claim_allocations(int cpu, struct sched_domain *sd) { struct sd_data *sdd = sd->private; - struct sched_group *sg = sd->groups; WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); *per_cpu_ptr(sdd->sd, cpu) = NULL; - if (cpu == cpumask_first(sched_group_cpus(sg))) { - WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg); + if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) *per_cpu_ptr(sdd->sg, cpu) = NULL; - } + + if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) + *per_cpu_ptr(sdd->sgp, cpu) = NULL; } #ifdef CONFIG_SCHED_SMT @@ -7210,7 +7372,7 @@ static struct sched_domain_topology_level default_topology[] = { #endif { sd_init_CPU, cpu_cpu_mask, }, #ifdef CONFIG_NUMA - { sd_init_NODE, cpu_node_mask, }, + { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, { sd_init_ALLNODES, cpu_allnodes_mask, }, #endif { NULL, }, @@ -7234,9 +7396,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sdd->sg) return -ENOMEM; + sdd->sgp = alloc_percpu(struct sched_group_power *); + if (!sdd->sgp) + return -ENOMEM; + for_each_cpu(j, cpu_map) { struct sched_domain *sd; struct sched_group *sg; + struct sched_group_power *sgp; sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); @@ -7251,6 +7418,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map) return -ENOMEM; *per_cpu_ptr(sdd->sg, j) = sg; + + sgp = kzalloc_node(sizeof(struct sched_group_power), + GFP_KERNEL, cpu_to_node(j)); + if (!sgp) + return -ENOMEM; + + *per_cpu_ptr(sdd->sgp, j) = sgp; } } @@ -7266,11 +7440,15 @@ static void __sdt_free(const struct cpumask *cpu_map) struct sd_data *sdd = &tl->data; for_each_cpu(j, cpu_map) { - kfree(*per_cpu_ptr(sdd->sd, j)); + struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); + if (sd && (sd->flags & SD_OVERLAP)) + free_sched_groups(sd->groups, 0); kfree(*per_cpu_ptr(sdd->sg, j)); + kfree(*per_cpu_ptr(sdd->sgp, j)); } free_percpu(sdd->sd); free_percpu(sdd->sg); + free_percpu(sdd->sgp); } } @@ -7316,8 +7494,13 @@ static int build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_topology_level *tl; sd = NULL; - for (tl = sched_domain_topology; tl->init; tl++) + for (tl = sched_domain_topology; tl->init; tl++) { sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); + if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) + sd->flags |= SD_OVERLAP; + if (cpumask_equal(cpu_map, sched_domain_span(sd))) + break; + } while (sd->child) sd = sd->child; @@ -7329,13 +7512,13 @@ static int build_sched_domains(const struct cpumask *cpu_map, for_each_cpu(i, cpu_map) { for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { sd->span_weight = cpumask_weight(sched_domain_span(sd)); - get_group(i, sd->private, &sd->groups); - atomic_inc(&sd->groups->ref); - - if (i != cpumask_first(sched_domain_span(sd))) - continue; - - build_sched_groups(sd); + if (sd->flags & SD_OVERLAP) { + if (build_overlap_sched_groups(sd, i)) + goto error; + } else { + if (build_sched_groups(sd, i)) + goto error; + } } } @@ -7745,18 +7928,14 @@ int in_sched_functions(unsigned long addr) && addr < (unsigned long)__sched_text_end); } -static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) +static void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT; INIT_LIST_HEAD(&cfs_rq->tasks); -#ifdef CONFIG_FAIR_GROUP_SCHED - cfs_rq->rq = rq; - /* allow initial update_cfs_load() to truncate */ -#ifdef CONFIG_SMP - cfs_rq->load_stamp = 1; -#endif -#endif cfs_rq->min_vruntime = (u64)(-(1LL << 20)); +#ifndef CONFIG_64BIT + cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; +#endif } static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) @@ -7772,27 +7951,18 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) /* delimiter for bitsearch: */ __set_bit(MAX_RT_PRIO, array->bitmap); -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED +#if defined CONFIG_SMP rt_rq->highest_prio.curr = MAX_RT_PRIO; -#ifdef CONFIG_SMP rt_rq->highest_prio.next = MAX_RT_PRIO; -#endif -#endif -#ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; - plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); + plist_head_init(&rt_rq->pushable_tasks); #endif rt_rq->rt_time = 0; rt_rq->rt_throttled = 0; rt_rq->rt_runtime = 0; raw_spin_lock_init(&rt_rq->rt_runtime_lock); - -#ifdef CONFIG_RT_GROUP_SCHED - rt_rq->rt_nr_boosted = 0; - rt_rq->rq = rq; -#endif } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -7801,11 +7971,17 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *parent) { struct rq *rq = cpu_rq(cpu); - tg->cfs_rq[cpu] = cfs_rq; - init_cfs_rq(cfs_rq, rq); + cfs_rq->tg = tg; + cfs_rq->rq = rq; +#ifdef CONFIG_SMP + /* allow initial update_cfs_load() to truncate */ + cfs_rq->load_stamp = 1; +#endif + tg->cfs_rq[cpu] = cfs_rq; tg->se[cpu] = se; + /* se could be NULL for root_task_group */ if (!se) return; @@ -7828,12 +8004,14 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, { struct rq *rq = cpu_rq(cpu); - tg->rt_rq[cpu] = rt_rq; - init_rt_rq(rt_rq, rq); + rt_rq->highest_prio.curr = MAX_RT_PRIO; + rt_rq->rt_nr_boosted = 0; + rt_rq->rq = rq; rt_rq->tg = tg; - rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; + tg->rt_rq[cpu] = rt_rq; tg->rt_se[cpu] = rt_se; + if (!rt_se) return; @@ -7915,7 +8093,7 @@ void __init sched_init(void) rq->nr_running = 0; rq->calc_load_active = 0; rq->calc_load_update = jiffies + LOAD_FREQ; - init_cfs_rq(&rq->cfs, rq); + init_cfs_rq(&rq->cfs); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.shares = root_task_group_load; @@ -7986,7 +8164,7 @@ void __init sched_init(void) #endif #ifdef CONFIG_RT_MUTEXES - plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock); + plist_head_init(&init_task.pi_waiters); #endif /* @@ -8029,7 +8207,7 @@ void __init sched_init(void) scheduler_running = 1; } -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP static inline int preempt_count_equals(int preempt_offset) { int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); @@ -8039,7 +8217,6 @@ static inline int preempt_count_equals(int preempt_offset) void __might_sleep(const char *file, int line, int preempt_offset) { -#ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || @@ -8061,7 +8238,6 @@ void __might_sleep(const char *file, int line, int preempt_offset) if (irqs_disabled()) print_irqtrace_events(current); dump_stack(); -#endif } EXPORT_SYMBOL(__might_sleep); #endif @@ -8220,6 +8396,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) if (!se) goto err_free_rq; + init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); } @@ -8247,7 +8424,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); raw_spin_unlock_irqrestore(&rq->lock, flags); } -#else /* !CONFG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED */ static inline void free_fair_sched_group(struct task_group *tg) { } @@ -8268,7 +8445,8 @@ static void free_rt_sched_group(struct task_group *tg) { int i; - destroy_rt_bandwidth(&tg->rt_bandwidth); + if (tg->rt_se) + destroy_rt_bandwidth(&tg->rt_bandwidth); for_each_possible_cpu(i) { if (tg->rt_rq) @@ -8309,6 +8487,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) if (!rt_se) goto err_free_rq; + init_rt_rq(rt_rq, cpu_rq(i)); + rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); } @@ -8450,10 +8630,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) if (!tg->se[0]) return -EINVAL; - if (shares < MIN_SHARES) - shares = MIN_SHARES; - else if (shares > MAX_SHARES) - shares = MAX_SHARES; + shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); mutex_lock(&shares_mutex); if (tg->shares == shares) diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h index 05577055cfca..c2f0e7248dca 100644 --- a/kernel/sched_autogroup.h +++ b/kernel/sched_autogroup.h @@ -13,6 +13,7 @@ struct autogroup { int nice; }; +static inline bool task_group_is_autogroup(struct task_group *tg); static inline struct task_group * autogroup_task_group(struct task_struct *p, struct task_group *tg); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 433491c2dc8f..bc8ee9993814 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -135,14 +135,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return grp->my_q; } -/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on - * another cpu ('this_cpu') - */ -static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) -{ - return cfs_rq->tg->cfs_rq[this_cpu]; -} - static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { @@ -271,11 +263,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return NULL; } -static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) -{ - return &cpu_rq(this_cpu)->cfs; -} - static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { } @@ -334,11 +321,6 @@ static inline int entity_before(struct sched_entity *a, return (s64)(a->vruntime - b->vruntime) < 0; } -static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - return se->vruntime - cfs_rq->min_vruntime; -} - static void update_min_vruntime(struct cfs_rq *cfs_rq) { u64 vruntime = cfs_rq->min_vruntime; @@ -372,7 +354,6 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; struct rb_node *parent = NULL; struct sched_entity *entry; - s64 key = entity_key(cfs_rq, se); int leftmost = 1; /* @@ -385,7 +366,7 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * We dont care about collisions. Nodes with * the same key stay together. */ - if (key < entity_key(cfs_rq, entry)) { + if (entity_before(se, entry)) { link = &parent->rb_left; } else { link = &parent->rb_right; @@ -1336,7 +1317,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) } for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); + cfs_rq = cfs_rq_of(se); update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); @@ -1370,13 +1351,16 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) */ if (task_sleep && parent_entity(se)) set_next_buddy(parent_entity(se)); + + /* avoid re-evaluating load for this entity */ + se = parent_entity(se); break; } flags |= DEQUEUE_SLEEP; } for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); + cfs_rq = cfs_rq_of(se); update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); @@ -1481,7 +1465,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * effect of the currently running task from the load * of the current CPU: */ - rcu_read_lock(); if (sync) { tg = task_group(current); weight = current->se.load.weight; @@ -1517,7 +1500,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) balanced = this_eff_load <= prev_eff_load; } else balanced = true; - rcu_read_unlock(); /* * If the currently running task will sleep within @@ -1585,7 +1567,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, } /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power; + avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; if (local_group) { this_load = avg_load; @@ -1921,8 +1903,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (!sched_feat(WAKEUP_PREEMPT)) return; - update_curr(cfs_rq); find_matching_se(&se, &pse); + update_curr(cfs_rq_of(se)); BUG_ON(!pse); if (wakeup_preempt_entity(se, pse) == 1) { /* @@ -2231,11 +2213,43 @@ static void update_shares(int cpu) struct rq *rq = cpu_rq(cpu); rcu_read_lock(); + /* + * Iterates the task_group tree in a bottom up fashion, see + * list_add_leaf_cfs_rq() for details. + */ for_each_leaf_cfs_rq(rq, cfs_rq) update_shares_cpu(cfs_rq->tg, cpu); rcu_read_unlock(); } +/* + * Compute the cpu's hierarchical load factor for each task group. + * This needs to be done in a top-down fashion because the load of a child + * group is a fraction of its parents load. + */ +static int tg_load_down(struct task_group *tg, void *data) +{ + unsigned long load; + long cpu = (long)data; + + if (!tg->parent) { + load = cpu_rq(cpu)->load.weight; + } else { + load = tg->parent->cfs_rq[cpu]->h_load; + load *= tg->se[cpu]->load.weight; + load /= tg->parent->cfs_rq[cpu]->load.weight + 1; + } + + tg->cfs_rq[cpu]->h_load = load; + + return 0; +} + +static void update_h_load(long cpu) +{ + walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); +} + static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, @@ -2243,14 +2257,12 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, int *all_pinned) { long rem_load_move = max_load_move; - int busiest_cpu = cpu_of(busiest); - struct task_group *tg; + struct cfs_rq *busiest_cfs_rq; rcu_read_lock(); - update_h_load(busiest_cpu); + update_h_load(cpu_of(busiest)); - list_for_each_entry_rcu(tg, &task_groups, list) { - struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; + for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) { unsigned long busiest_h_load = busiest_cfs_rq->h_load; unsigned long busiest_weight = busiest_cfs_rq->load.weight; u64 rem_load, moved_load; @@ -2631,7 +2643,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) power >>= SCHED_POWER_SHIFT; } - sdg->cpu_power_orig = power; + sdg->sgp->power_orig = power; if (sched_feat(ARCH_POWER)) power *= arch_scale_freq_power(sd, cpu); @@ -2647,7 +2659,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) power = 1; cpu_rq(cpu)->cpu_power = power; - sdg->cpu_power = power; + sdg->sgp->power = power; } static void update_group_power(struct sched_domain *sd, int cpu) @@ -2665,11 +2677,11 @@ static void update_group_power(struct sched_domain *sd, int cpu) group = child->groups; do { - power += group->cpu_power; + power += group->sgp->power; group = group->next; } while (group != child->groups); - sdg->cpu_power = power; + sdg->sgp->power = power; } /* @@ -2691,7 +2703,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) /* * If ~90% of the cpu_power is still there, we're good. */ - if (group->cpu_power * 32 > group->cpu_power_orig * 29) + if (group->sgp->power * 32 > group->sgp->power_orig * 29) return 1; return 0; @@ -2771,7 +2783,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, } /* Adjust by relative CPU power of the group */ - sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->cpu_power; + sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; /* * Consider the group unbalanced when the imbalance is larger @@ -2788,7 +2800,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) sgs->group_imb = 1; - sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, + sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, SCHED_POWER_SCALE); if (!sgs->group_capacity) sgs->group_capacity = fix_small_capacity(sd, group); @@ -2877,7 +2889,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, return; sds->total_load += sgs.group_load; - sds->total_pwr += sg->cpu_power; + sds->total_pwr += sg->sgp->power; /* * In case the child domain prefers tasks go to siblings @@ -2962,7 +2974,7 @@ static int check_asym_packing(struct sched_domain *sd, if (this_cpu > busiest_cpu) return 0; - *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, + *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); return 1; } @@ -2993,7 +3005,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, scaled_busy_load_per_task = sds->busiest_load_per_task * SCHED_POWER_SCALE; - scaled_busy_load_per_task /= sds->busiest->cpu_power; + scaled_busy_load_per_task /= sds->busiest->sgp->power; if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= (scaled_busy_load_per_task * imbn)) { @@ -3007,28 +3019,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, * moving them. */ - pwr_now += sds->busiest->cpu_power * + pwr_now += sds->busiest->sgp->power * min(sds->busiest_load_per_task, sds->max_load); - pwr_now += sds->this->cpu_power * + pwr_now += sds->this->sgp->power * min(sds->this_load_per_task, sds->this_load); pwr_now /= SCHED_POWER_SCALE; /* Amount of load we'd subtract */ tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / - sds->busiest->cpu_power; + sds->busiest->sgp->power; if (sds->max_load > tmp) - pwr_move += sds->busiest->cpu_power * + pwr_move += sds->busiest->sgp->power * min(sds->busiest_load_per_task, sds->max_load - tmp); /* Amount of load we'd add */ - if (sds->max_load * sds->busiest->cpu_power < + if (sds->max_load * sds->busiest->sgp->power < sds->busiest_load_per_task * SCHED_POWER_SCALE) - tmp = (sds->max_load * sds->busiest->cpu_power) / - sds->this->cpu_power; + tmp = (sds->max_load * sds->busiest->sgp->power) / + sds->this->sgp->power; else tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / - sds->this->cpu_power; - pwr_move += sds->this->cpu_power * + sds->this->sgp->power; + pwr_move += sds->this->sgp->power * min(sds->this_load_per_task, sds->this_load + tmp); pwr_move /= SCHED_POWER_SCALE; @@ -3074,7 +3086,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); - load_above_capacity /= sds->busiest->cpu_power; + load_above_capacity /= sds->busiest->sgp->power; } /* @@ -3090,8 +3102,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); /* How much load to actually move to equalise the imbalance */ - *imbalance = min(max_pull * sds->busiest->cpu_power, - (sds->avg_load - sds->this_load) * sds->this->cpu_power) + *imbalance = min(max_pull * sds->busiest->sgp->power, + (sds->avg_load - sds->this_load) * sds->this->sgp->power) / SCHED_POWER_SCALE; /* diff --git a/kernel/sched_features.h b/kernel/sched_features.h index be40f7371ee1..2e74677cb040 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -61,12 +61,14 @@ SCHED_FEAT(LB_BIAS, 1) SCHED_FEAT(OWNER_SPIN, 1) /* - * Decrement CPU power based on irq activity + * Decrement CPU power based on time not spent running tasks */ -SCHED_FEAT(NONIRQ_POWER, 1) +SCHED_FEAT(NONTASK_POWER, 1) /* * Queue remote wakeups on the target CPU and process them * using the scheduler IPI. Reduces rq->lock contention/bounces. */ SCHED_FEAT(TTWU_QUEUE, 1) + +SCHED_FEAT(FORCE_SD_OVERLAP, 0) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 10d018212bab..97540f0c9e47 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -185,11 +185,23 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) typedef struct task_group *rt_rq_iter_t; -#define for_each_rt_rq(rt_rq, iter, rq) \ - for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \ - (&iter->list != &task_groups) && \ - (rt_rq = iter->rt_rq[cpu_of(rq)]); \ - iter = list_entry_rcu(iter->list.next, typeof(*iter), list)) +static inline struct task_group *next_task_group(struct task_group *tg) +{ + do { + tg = list_entry_rcu(tg->list.next, + typeof(struct task_group), list); + } while (&tg->list != &task_groups && task_group_is_autogroup(tg)); + + if (&tg->list == &task_groups) + tg = NULL; + + return tg; +} + +#define for_each_rt_rq(rt_rq, iter, rq) \ + for (iter = container_of(&task_groups, typeof(*iter), list); \ + (iter = next_task_group(iter)) && \ + (rt_rq = iter->rt_rq[cpu_of(rq)]);) static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) { @@ -1126,7 +1138,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) rt_rq = &rq->rt; - if (unlikely(!rt_rq->rt_nr_running)) + if (!rt_rq->rt_nr_running) return NULL; if (rt_rq_throttled(rt_rq)) @@ -1548,7 +1560,7 @@ skip: static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) { /* Try to pull RT tasks here if we lower this rq's prio */ - if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio) + if (rq->rt.highest_prio.curr > prev->prio) pull_rt_task(rq); } diff --git a/kernel/signal.c b/kernel/signal.c index ff7678603328..291c9700be75 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -87,7 +87,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) /* * Tracers may want to know about even ignored signals. */ - return !tracehook_consider_ignored_signal(t, sig); + return !t->ptrace; } /* @@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) static int recalc_sigpending_tsk(struct task_struct *t) { - if ((t->group_stop & GROUP_STOP_PENDING) || + if ((t->jobctl & JOBCTL_PENDING_MASK) || PENDING(&t->pending, &t->blocked) || PENDING(&t->signal->shared_pending, &t->blocked)) { set_tsk_thread_flag(t, TIF_SIGPENDING); @@ -150,9 +150,7 @@ void recalc_sigpending_and_wake(struct task_struct *t) void recalc_sigpending(void) { - if (unlikely(tracehook_force_sigpending())) - set_thread_flag(TIF_SIGPENDING); - else if (!recalc_sigpending_tsk(current) && !freezing(current)) + if (!recalc_sigpending_tsk(current) && !freezing(current)) clear_thread_flag(TIF_SIGPENDING); } @@ -224,47 +222,93 @@ static inline void print_dropped_signal(int sig) } /** - * task_clear_group_stop_trapping - clear group stop trapping bit + * task_set_jobctl_pending - set jobctl pending bits * @task: target task + * @mask: pending bits to set * - * If GROUP_STOP_TRAPPING is set, a ptracer is waiting for us. Clear it - * and wake up the ptracer. Note that we don't need any further locking. - * @task->siglock guarantees that @task->parent points to the ptracer. + * Clear @mask from @task->jobctl. @mask must be subset of + * %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK | + * %JOBCTL_TRAPPING. If stop signo is being set, the existing signo is + * cleared. If @task is already being killed or exiting, this function + * becomes noop. * * CONTEXT: * Must be called with @task->sighand->siglock held. + * + * RETURNS: + * %true if @mask is set, %false if made noop because @task was dying. */ -static void task_clear_group_stop_trapping(struct task_struct *task) +bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask) { - if (unlikely(task->group_stop & GROUP_STOP_TRAPPING)) { - task->group_stop &= ~GROUP_STOP_TRAPPING; - __wake_up_sync_key(&task->parent->signal->wait_chldexit, - TASK_UNINTERRUPTIBLE, 1, task); + BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME | + JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING)); + BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK)); + + if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING))) + return false; + + if (mask & JOBCTL_STOP_SIGMASK) + task->jobctl &= ~JOBCTL_STOP_SIGMASK; + + task->jobctl |= mask; + return true; +} + +/** + * task_clear_jobctl_trapping - clear jobctl trapping bit + * @task: target task + * + * If JOBCTL_TRAPPING is set, a ptracer is waiting for us to enter TRACED. + * Clear it and wake up the ptracer. Note that we don't need any further + * locking. @task->siglock guarantees that @task->parent points to the + * ptracer. + * + * CONTEXT: + * Must be called with @task->sighand->siglock held. + */ +void task_clear_jobctl_trapping(struct task_struct *task) +{ + if (unlikely(task->jobctl & JOBCTL_TRAPPING)) { + task->jobctl &= ~JOBCTL_TRAPPING; + wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT); } } /** - * task_clear_group_stop_pending - clear pending group stop + * task_clear_jobctl_pending - clear jobctl pending bits * @task: target task + * @mask: pending bits to clear + * + * Clear @mask from @task->jobctl. @mask must be subset of + * %JOBCTL_PENDING_MASK. If %JOBCTL_STOP_PENDING is being cleared, other + * STOP bits are cleared together. * - * Clear group stop states for @task. + * If clearing of @mask leaves no stop or trap pending, this function calls + * task_clear_jobctl_trapping(). * * CONTEXT: * Must be called with @task->sighand->siglock held. */ -void task_clear_group_stop_pending(struct task_struct *task) +void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask) { - task->group_stop &= ~(GROUP_STOP_PENDING | GROUP_STOP_CONSUME | - GROUP_STOP_DEQUEUED); + BUG_ON(mask & ~JOBCTL_PENDING_MASK); + + if (mask & JOBCTL_STOP_PENDING) + mask |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED; + + task->jobctl &= ~mask; + + if (!(task->jobctl & JOBCTL_PENDING_MASK)) + task_clear_jobctl_trapping(task); } /** * task_participate_group_stop - participate in a group stop * @task: task participating in a group stop * - * @task has GROUP_STOP_PENDING set and is participating in a group stop. + * @task has %JOBCTL_STOP_PENDING set and is participating in a group stop. * Group stop states are cleared and the group stop count is consumed if - * %GROUP_STOP_CONSUME was set. If the consumption completes the group + * %JOBCTL_STOP_CONSUME was set. If the consumption completes the group * stop, the appropriate %SIGNAL_* flags are set. * * CONTEXT: @@ -277,11 +321,11 @@ void task_clear_group_stop_pending(struct task_struct *task) static bool task_participate_group_stop(struct task_struct *task) { struct signal_struct *sig = task->signal; - bool consume = task->group_stop & GROUP_STOP_CONSUME; + bool consume = task->jobctl & JOBCTL_STOP_CONSUME; - WARN_ON_ONCE(!(task->group_stop & GROUP_STOP_PENDING)); + WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING)); - task_clear_group_stop_pending(task); + task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING); if (!consume) return false; @@ -449,7 +493,8 @@ int unhandled_signal(struct task_struct *tsk, int sig) return 1; if (handler != SIG_IGN && handler != SIG_DFL) return 0; - return !tracehook_consider_fatal_signal(tsk, sig); + /* if ptraced, let the tracer determine */ + return !tsk->ptrace; } /* @@ -604,7 +649,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) * is to alert stop-signal processing code when another * processor has come along and cleared the flag. */ - current->group_stop |= GROUP_STOP_DEQUEUED; + current->jobctl |= JOBCTL_STOP_DEQUEUED; } if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { /* @@ -773,6 +818,32 @@ static int check_kill_permission(int sig, struct siginfo *info, return security_task_kill(t, info, sig, 0); } +/** + * ptrace_trap_notify - schedule trap to notify ptracer + * @t: tracee wanting to notify tracer + * + * This function schedules sticky ptrace trap which is cleared on the next + * TRAP_STOP to notify ptracer of an event. @t must have been seized by + * ptracer. + * + * If @t is running, STOP trap will be taken. If trapped for STOP and + * ptracer is listening for events, tracee is woken up so that it can + * re-trap for the new event. If trapped otherwise, STOP trap will be + * eventually taken without returning to userland after the existing traps + * are finished by PTRACE_CONT. + * + * CONTEXT: + * Must be called with @task->sighand->siglock held. + */ +static void ptrace_trap_notify(struct task_struct *t) +{ + WARN_ON_ONCE(!(t->ptrace & PT_SEIZED)); + assert_spin_locked(&t->sighand->siglock); + + task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY); + signal_wake_up(t, t->jobctl & JOBCTL_LISTENING); +} + /* * Handle magic process-wide effects of stop/continue signals. Unlike * the signal actions, these happen immediately at signal-generation @@ -809,9 +880,12 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); t = p; do { - task_clear_group_stop_pending(t); + task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); - wake_up_state(t, __TASK_STOPPED); + if (likely(!(t->ptrace & PT_SEIZED))) + wake_up_state(t, __TASK_STOPPED); + else + ptrace_trap_notify(t); } while_each_thread(p, t); /* @@ -908,8 +982,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) if (sig_fatal(p, sig) && !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && !sigismember(&t->real_blocked, sig) && - (sig == SIGKILL || - !tracehook_consider_fatal_signal(t, sig))) { + (sig == SIGKILL || !t->ptrace)) { /* * This signal will be fatal to the whole group. */ @@ -925,7 +998,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) signal->group_stop_count = 0; t = p; do { - task_clear_group_stop_pending(t); + task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); } while_each_thread(p, t); @@ -1160,7 +1233,7 @@ int zap_other_threads(struct task_struct *p) p->signal->group_stop_count = 0; while_each_thread(p, t) { - task_clear_group_stop_pending(t); + task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); count++; /* Don't bother with already dead threads */ @@ -1178,18 +1251,25 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, { struct sighand_struct *sighand; - rcu_read_lock(); for (;;) { + local_irq_save(*flags); + rcu_read_lock(); sighand = rcu_dereference(tsk->sighand); - if (unlikely(sighand == NULL)) + if (unlikely(sighand == NULL)) { + rcu_read_unlock(); + local_irq_restore(*flags); break; + } - spin_lock_irqsave(&sighand->siglock, *flags); - if (likely(sighand == tsk->sighand)) + spin_lock(&sighand->siglock); + if (likely(sighand == tsk->sighand)) { + rcu_read_unlock(); break; - spin_unlock_irqrestore(&sighand->siglock, *flags); + } + spin_unlock(&sighand->siglock); + rcu_read_unlock(); + local_irq_restore(*flags); } - rcu_read_unlock(); return sighand; } @@ -1504,22 +1584,22 @@ ret: * Let a parent know about the death of a child. * For a stopped/continued status change, use do_notify_parent_cldstop instead. * - * Returns -1 if our parent ignored us and so we've switched to - * self-reaping, or else @sig. + * Returns true if our parent ignored us and so we've switched to + * self-reaping. */ -int do_notify_parent(struct task_struct *tsk, int sig) +bool do_notify_parent(struct task_struct *tsk, int sig) { struct siginfo info; unsigned long flags; struct sighand_struct *psig; - int ret = sig; + bool autoreap = false; BUG_ON(sig == -1); /* do_notify_parent_cldstop should have been called instead. */ BUG_ON(task_is_stopped_or_traced(tsk)); - BUG_ON(!task_ptrace(tsk) && + BUG_ON(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); info.si_signo = sig; @@ -1558,7 +1638,7 @@ int do_notify_parent(struct task_struct *tsk, int sig) psig = tsk->parent->sighand; spin_lock_irqsave(&psig->siglock, flags); - if (!task_ptrace(tsk) && sig == SIGCHLD && + if (!tsk->ptrace && sig == SIGCHLD && (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { /* @@ -1576,16 +1656,16 @@ int do_notify_parent(struct task_struct *tsk, int sig) * is implementation-defined: we do (if you don't want * it, just use SIG_IGN instead). */ - ret = tsk->exit_signal = -1; + autoreap = true; if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) - sig = -1; + sig = 0; } - if (valid_signal(sig) && sig > 0) + if (valid_signal(sig) && sig) __group_send_sig_info(sig, &info, tsk->parent); __wake_up_parent(tsk, tsk->parent); spin_unlock_irqrestore(&psig->siglock, flags); - return ret; + return autoreap; } /** @@ -1658,7 +1738,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, static inline int may_ptrace_stop(void) { - if (!likely(task_ptrace(current))) + if (!likely(current->ptrace)) return 0; /* * Are we in the middle of do_coredump? @@ -1687,15 +1767,6 @@ static int sigkill_pending(struct task_struct *tsk) } /* - * Test whether the target task of the usual cldstop notification - the - * real_parent of @child - is in the same group as the ptracer. - */ -static bool real_parent_is_ptracer(struct task_struct *child) -{ - return same_thread_group(child->parent, child->real_parent); -} - -/* * This must be called with current->sighand->siglock held. * * This should be the path for all ptrace stops. @@ -1732,31 +1803,34 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) } /* - * If @why is CLD_STOPPED, we're trapping to participate in a group - * stop. Do the bookkeeping. Note that if SIGCONT was delievered - * while siglock was released for the arch hook, PENDING could be - * clear now. We act as if SIGCONT is received after TASK_TRACED - * is entered - ignore it. + * We're committing to trapping. TRACED should be visible before + * TRAPPING is cleared; otherwise, the tracer might fail do_wait(). + * Also, transition to TRACED and updates to ->jobctl should be + * atomic with respect to siglock and should be done after the arch + * hook as siglock is released and regrabbed across it. */ - if (why == CLD_STOPPED && (current->group_stop & GROUP_STOP_PENDING)) - gstop_done = task_participate_group_stop(current); + set_current_state(TASK_TRACED); current->last_siginfo = info; current->exit_code = exit_code; /* - * TRACED should be visible before TRAPPING is cleared; otherwise, - * the tracer might fail do_wait(). + * If @why is CLD_STOPPED, we're trapping to participate in a group + * stop. Do the bookkeeping. Note that if SIGCONT was delievered + * across siglock relocks since INTERRUPT was scheduled, PENDING + * could be clear now. We act as if SIGCONT is received after + * TASK_TRACED is entered - ignore it. */ - set_current_state(TASK_TRACED); + if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING)) + gstop_done = task_participate_group_stop(current); - /* - * We're committing to trapping. Clearing GROUP_STOP_TRAPPING and - * transition to TASK_TRACED should be atomic with respect to - * siglock. This hsould be done after the arch hook as siglock is - * released and regrabbed across it. - */ - task_clear_group_stop_trapping(current); + /* any trap clears pending STOP trap, STOP trap clears NOTIFY */ + task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP); + if (info && info->si_code >> 8 == PTRACE_EVENT_STOP) + task_clear_jobctl_pending(current, JOBCTL_TRAP_NOTIFY); + + /* entering a trap, clear TRAPPING */ + task_clear_jobctl_trapping(current); spin_unlock_irq(¤t->sighand->siglock); read_lock(&tasklist_lock); @@ -1772,7 +1846,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) * separately unless they're gonna be duplicates. */ do_notify_parent_cldstop(current, true, why); - if (gstop_done && !real_parent_is_ptracer(current)) + if (gstop_done && ptrace_reparented(current)) do_notify_parent_cldstop(current, false, why); /* @@ -1792,9 +1866,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) * * If @gstop_done, the ptracer went away between group stop * completion and here. During detach, it would have set - * GROUP_STOP_PENDING on us and we'll re-enter TASK_STOPPED - * in do_signal_stop() on return, so notifying the real - * parent of the group stop completion is enough. + * JOBCTL_STOP_PENDING on us and we'll re-enter + * TASK_STOPPED in do_signal_stop() on return, so notifying + * the real parent of the group stop completion is enough. */ if (gstop_done) do_notify_parent_cldstop(current, false, why); @@ -1820,6 +1894,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) spin_lock_irq(¤t->sighand->siglock); current->last_siginfo = NULL; + /* LISTENING can be set only during STOP traps, clear it */ + current->jobctl &= ~JOBCTL_LISTENING; + /* * Queued signals ignored us while we were stopped for tracing. * So check for any that we should take before resuming user mode. @@ -1828,44 +1905,66 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) recalc_sigpending_tsk(current); } -void ptrace_notify(int exit_code) +static void ptrace_do_notify(int signr, int exit_code, int why) { siginfo_t info; - BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); - memset(&info, 0, sizeof info); - info.si_signo = SIGTRAP; + info.si_signo = signr; info.si_code = exit_code; info.si_pid = task_pid_vnr(current); info.si_uid = current_uid(); /* Let the debugger run. */ + ptrace_stop(exit_code, why, 1, &info); +} + +void ptrace_notify(int exit_code) +{ + BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); + spin_lock_irq(¤t->sighand->siglock); - ptrace_stop(exit_code, CLD_TRAPPED, 1, &info); + ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED); spin_unlock_irq(¤t->sighand->siglock); } -/* - * This performs the stopping for SIGSTOP and other stop signals. - * We have to stop all threads in the thread group. - * Returns non-zero if we've actually stopped and released the siglock. - * Returns zero if we didn't stop and still hold the siglock. +/** + * do_signal_stop - handle group stop for SIGSTOP and other stop signals + * @signr: signr causing group stop if initiating + * + * If %JOBCTL_STOP_PENDING is not set yet, initiate group stop with @signr + * and participate in it. If already set, participate in the existing + * group stop. If participated in a group stop (and thus slept), %true is + * returned with siglock released. + * + * If ptraced, this function doesn't handle stop itself. Instead, + * %JOBCTL_TRAP_STOP is scheduled and %false is returned with siglock + * untouched. The caller must ensure that INTERRUPT trap handling takes + * places afterwards. + * + * CONTEXT: + * Must be called with @current->sighand->siglock held, which is released + * on %true return. + * + * RETURNS: + * %false if group stop is already cancelled or ptrace trap is scheduled. + * %true if participated in group stop. */ -static int do_signal_stop(int signr) +static bool do_signal_stop(int signr) + __releases(¤t->sighand->siglock) { struct signal_struct *sig = current->signal; - if (!(current->group_stop & GROUP_STOP_PENDING)) { - unsigned int gstop = GROUP_STOP_PENDING | GROUP_STOP_CONSUME; + if (!(current->jobctl & JOBCTL_STOP_PENDING)) { + unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; struct task_struct *t; - /* signr will be recorded in task->group_stop for retries */ - WARN_ON_ONCE(signr & ~GROUP_STOP_SIGMASK); + /* signr will be recorded in task->jobctl for retries */ + WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK); - if (!likely(current->group_stop & GROUP_STOP_DEQUEUED) || + if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) || unlikely(signal_group_exit(sig))) - return 0; + return false; /* * There is no group stop already in progress. We must * initiate one now. @@ -1888,28 +1987,32 @@ static int do_signal_stop(int signr) if (!(sig->flags & SIGNAL_STOP_STOPPED)) sig->group_exit_code = signr; else - WARN_ON_ONCE(!task_ptrace(current)); + WARN_ON_ONCE(!current->ptrace); + + sig->group_stop_count = 0; + + if (task_set_jobctl_pending(current, signr | gstop)) + sig->group_stop_count++; - current->group_stop &= ~GROUP_STOP_SIGMASK; - current->group_stop |= signr | gstop; - sig->group_stop_count = 1; for (t = next_thread(current); t != current; t = next_thread(t)) { - t->group_stop &= ~GROUP_STOP_SIGMASK; /* * Setting state to TASK_STOPPED for a group * stop is always done with the siglock held, * so this check has no races. */ - if (!(t->flags & PF_EXITING) && !task_is_stopped(t)) { - t->group_stop |= signr | gstop; + if (!task_is_stopped(t) && + task_set_jobctl_pending(t, signr | gstop)) { sig->group_stop_count++; - signal_wake_up(t, 0); + if (likely(!(t->ptrace & PT_SEIZED))) + signal_wake_up(t, 0); + else + ptrace_trap_notify(t); } } } -retry: - if (likely(!task_ptrace(current))) { + + if (likely(!current->ptrace)) { int notify = 0; /* @@ -1940,43 +2043,65 @@ retry: /* Now we don't run again until woken by SIGCONT or SIGKILL */ schedule(); - - spin_lock_irq(¤t->sighand->siglock); + return true; } else { - ptrace_stop(current->group_stop & GROUP_STOP_SIGMASK, - CLD_STOPPED, 0, NULL); - current->exit_code = 0; + /* + * While ptraced, group stop is handled by STOP trap. + * Schedule it and let the caller deal with it. + */ + task_set_jobctl_pending(current, JOBCTL_TRAP_STOP); + return false; } +} - /* - * GROUP_STOP_PENDING could be set if another group stop has - * started since being woken up or ptrace wants us to transit - * between TASK_STOPPED and TRACED. Retry group stop. - */ - if (current->group_stop & GROUP_STOP_PENDING) { - WARN_ON_ONCE(!(current->group_stop & GROUP_STOP_SIGMASK)); - goto retry; +/** + * do_jobctl_trap - take care of ptrace jobctl traps + * + * When PT_SEIZED, it's used for both group stop and explicit + * SEIZE/INTERRUPT traps. Both generate PTRACE_EVENT_STOP trap with + * accompanying siginfo. If stopped, lower eight bits of exit_code contain + * the stop signal; otherwise, %SIGTRAP. + * + * When !PT_SEIZED, it's used only for group stop trap with stop signal + * number as exit_code and no siginfo. + * + * CONTEXT: + * Must be called with @current->sighand->siglock held, which may be + * released and re-acquired before returning with intervening sleep. + */ +static void do_jobctl_trap(void) +{ + struct signal_struct *signal = current->signal; + int signr = current->jobctl & JOBCTL_STOP_SIGMASK; + + if (current->ptrace & PT_SEIZED) { + if (!signal->group_stop_count && + !(signal->flags & SIGNAL_STOP_STOPPED)) + signr = SIGTRAP; + WARN_ON_ONCE(!signr); + ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8), + CLD_STOPPED); + } else { + WARN_ON_ONCE(!signr); + ptrace_stop(signr, CLD_STOPPED, 0, NULL); + current->exit_code = 0; } - - /* PTRACE_ATTACH might have raced with task killing, clear trapping */ - task_clear_group_stop_trapping(current); - - spin_unlock_irq(¤t->sighand->siglock); - - tracehook_finish_jctl(); - - return 1; } static int ptrace_signal(int signr, siginfo_t *info, struct pt_regs *regs, void *cookie) { - if (!task_ptrace(current)) - return signr; - ptrace_signal_deliver(regs, cookie); - - /* Let the debugger run. */ + /* + * We do not check sig_kernel_stop(signr) but set this marker + * unconditionally because we do not know whether debugger will + * change signr. This flag has no meaning unless we are going + * to stop after return from ptrace_stop(). In this case it will + * be checked in do_signal_stop(), we should only stop if it was + * not cleared by SIGCONT while we were sleeping. See also the + * comment in dequeue_signal(). + */ + current->jobctl |= JOBCTL_STOP_DEQUEUED; ptrace_stop(signr, CLD_TRAPPED, 0, info); /* We're back. Did the debugger cancel the sig? */ @@ -2032,7 +2157,6 @@ relock: * the CLD_ si_code into SIGNAL_CLD_MASK bits. */ if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { - struct task_struct *leader; int why; if (signal->flags & SIGNAL_CLD_CONTINUED) @@ -2053,13 +2177,11 @@ relock: * a duplicate. */ read_lock(&tasklist_lock); - do_notify_parent_cldstop(current, false, why); - leader = current->group_leader; - if (task_ptrace(leader) && !real_parent_is_ptracer(leader)) - do_notify_parent_cldstop(leader, true, why); - + if (ptrace_reparented(current->group_leader)) + do_notify_parent_cldstop(current->group_leader, + true, why); read_unlock(&tasklist_lock); goto relock; @@ -2067,37 +2189,31 @@ relock: for (;;) { struct k_sigaction *ka; - /* - * Tracing can induce an artificial signal and choose sigaction. - * The return value in @signr determines the default action, - * but @info->si_signo is the signal number we will report. - */ - signr = tracehook_get_signal(current, regs, info, return_ka); - if (unlikely(signr < 0)) + + if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) && + do_signal_stop(0)) goto relock; - if (unlikely(signr != 0)) - ka = return_ka; - else { - if (unlikely(current->group_stop & - GROUP_STOP_PENDING) && do_signal_stop(0)) - goto relock; - signr = dequeue_signal(current, ¤t->blocked, - info); + if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) { + do_jobctl_trap(); + spin_unlock_irq(&sighand->siglock); + goto relock; + } - if (!signr) - break; /* will return 0 */ + signr = dequeue_signal(current, ¤t->blocked, info); - if (signr != SIGKILL) { - signr = ptrace_signal(signr, info, - regs, cookie); - if (!signr) - continue; - } + if (!signr) + break; /* will return 0 */ - ka = &sighand->action[signr-1]; + if (unlikely(current->ptrace) && signr != SIGKILL) { + signr = ptrace_signal(signr, info, + regs, cookie); + if (!signr) + continue; } + ka = &sighand->action[signr-1]; + /* Trace actually delivered signals. */ trace_signal_deliver(signr, info, ka); @@ -2253,7 +2369,7 @@ void exit_signals(struct task_struct *tsk) signotset(&unblocked); retarget_shared_pending(tsk, &unblocked); - if (unlikely(tsk->group_stop & GROUP_STOP_PENDING) && + if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) && task_participate_group_stop(tsk)) group_stop = CLD_STOPPED; out: @@ -2986,15 +3102,11 @@ SYSCALL_DEFINE0(sgetmask) SYSCALL_DEFINE1(ssetmask, int, newmask) { - int old; - - spin_lock_irq(¤t->sighand->siglock); - old = current->blocked.sig[0]; + int old = current->blocked.sig[0]; + sigset_t newset; - siginitset(¤t->blocked, newmask & ~(sigmask(SIGKILL)| - sigmask(SIGSTOP))); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); + siginitset(&newset, newmask & ~(sigmask(SIGKILL) | sigmask(SIGSTOP))); + set_current_blocked(&newset); return old; } @@ -3051,11 +3163,8 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) return -EFAULT; sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); - spin_lock_irq(¤t->sighand->siglock); current->saved_sigmask = current->blocked; - current->blocked = newset; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); + set_current_blocked(&newset); current->state = TASK_INTERRUPTIBLE; schedule(); diff --git a/kernel/softirq.c b/kernel/softirq.c index 40cf63ddd4b3..fca82c32042b 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -315,16 +315,24 @@ static inline void invoke_softirq(void) { if (!force_irqthreads) __do_softirq(); - else + else { + __local_bh_disable((unsigned long)__builtin_return_address(0), + SOFTIRQ_OFFSET); wakeup_softirqd(); + __local_bh_enable(SOFTIRQ_OFFSET); + } } #else static inline void invoke_softirq(void) { if (!force_irqthreads) do_softirq(); - else + else { + __local_bh_disable((unsigned long)__builtin_return_address(0), + SOFTIRQ_OFFSET); wakeup_softirqd(); + __local_bh_enable(SOFTIRQ_OFFSET); + } } #endif diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index e3516b29076c..ba5070ce5765 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -19,7 +19,7 @@ #include <linux/interrupt.h> #include <linux/kallsyms.h> -#include <asm/atomic.h> +#include <linux/atomic.h> /* * Structure to determine completion condition and record errors. May @@ -136,10 +136,11 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, static DEFINE_MUTEX(stop_cpus_mutex); static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); -int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) +static void queue_stop_cpus_work(const struct cpumask *cpumask, + cpu_stop_fn_t fn, void *arg, + struct cpu_stop_done *done) { struct cpu_stop_work *work; - struct cpu_stop_done done; unsigned int cpu; /* initialize works and done */ @@ -147,9 +148,8 @@ int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) work = &per_cpu(stop_cpus_work, cpu); work->fn = fn; work->arg = arg; - work->done = &done; + work->done = done; } - cpu_stop_init_done(&done, cpumask_weight(cpumask)); /* * Disable preemption while queueing to avoid getting @@ -161,7 +161,15 @@ int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &per_cpu(stop_cpus_work, cpu)); preempt_enable(); +} +static int __stop_cpus(const struct cpumask *cpumask, + cpu_stop_fn_t fn, void *arg) +{ + struct cpu_stop_done done; + + cpu_stop_init_done(&done, cpumask_weight(cpumask)); + queue_stop_cpus_work(cpumask, fn, arg, &done); wait_for_completion(&done.completion); return done.executed ? done.ret : -ENOENT; } @@ -431,8 +439,15 @@ static int stop_machine_cpu_stop(void *data) struct stop_machine_data *smdata = data; enum stopmachine_state curstate = STOPMACHINE_NONE; int cpu = smp_processor_id(), err = 0; + unsigned long flags; bool is_active; + /* + * When called from stop_machine_from_inactive_cpu(), irq might + * already be disabled. Save the state and restore it on exit. + */ + local_save_flags(flags); + if (!smdata->active_cpus) is_active = cpu == cpumask_first(cpu_online_mask); else @@ -460,7 +475,7 @@ static int stop_machine_cpu_stop(void *data) } } while (curstate != STOPMACHINE_EXIT); - local_irq_enable(); + local_irq_restore(flags); return err; } @@ -487,4 +502,57 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) } EXPORT_SYMBOL_GPL(stop_machine); +/** + * stop_machine_from_inactive_cpu - stop_machine() from inactive CPU + * @fn: the function to run + * @data: the data ptr for the @fn() + * @cpus: the cpus to run the @fn() on (NULL = any online cpu) + * + * This is identical to stop_machine() but can be called from a CPU which + * is not active. The local CPU is in the process of hotplug (so no other + * CPU hotplug can start) and not marked active and doesn't have enough + * context to sleep. + * + * This function provides stop_machine() functionality for such state by + * using busy-wait for synchronization and executing @fn directly for local + * CPU. + * + * CONTEXT: + * Local CPU is inactive. Temporarily stops all active CPUs. + * + * RETURNS: + * 0 if all executions of @fn returned 0, any non zero return value if any + * returned non zero. + */ +int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, + const struct cpumask *cpus) +{ + struct stop_machine_data smdata = { .fn = fn, .data = data, + .active_cpus = cpus }; + struct cpu_stop_done done; + int ret; + + /* Local CPU must be inactive and CPU hotplug in progress. */ + BUG_ON(cpu_active(raw_smp_processor_id())); + smdata.num_threads = num_active_cpus() + 1; /* +1 for local */ + + /* No proper task established and can't sleep - busy wait for lock. */ + while (!mutex_trylock(&stop_cpus_mutex)) + cpu_relax(); + + /* Schedule work on other CPUs and execute directly for local CPU */ + set_state(&smdata, STOPMACHINE_PREPARE); + cpu_stop_init_done(&done, num_active_cpus()); + queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata, + &done); + ret = stop_machine_cpu_stop(&smdata); + + /* Busy wait for completion. */ + while (!completion_done(&done.completion)) + cpu_relax(); + + mutex_unlock(&stop_cpus_mutex); + return ret ?: done.ret; +} + #endif /* CONFIG_STOP_MACHINE */ diff --git a/kernel/sys.c b/kernel/sys.c index e4128b278f23..a101ba36c444 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -8,7 +8,6 @@ #include <linux/mm.h> #include <linux/utsname.h> #include <linux/mman.h> -#include <linux/notifier.h> #include <linux/reboot.h> #include <linux/prctl.h> #include <linux/highuid.h> @@ -320,6 +319,37 @@ void kernel_restart_prepare(char *cmd) } /** + * register_reboot_notifier - Register function to be called at reboot time + * @nb: Info about notifier function to be called + * + * Registers a function with the list of functions + * to be called at reboot time. + * + * Currently always returns zero, as blocking_notifier_chain_register() + * always returns zero. + */ +int register_reboot_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&reboot_notifier_list, nb); +} +EXPORT_SYMBOL(register_reboot_notifier); + +/** + * unregister_reboot_notifier - Unregister previously registered reboot notifier + * @nb: Hook to be unregistered + * + * Unregisters a previously registered reboot + * notifier function. + * + * Returns zero on success, or %-ENOENT on failure. + */ +int unregister_reboot_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); +} +EXPORT_SYMBOL(unregister_reboot_notifier); + +/** * kernel_restart - reboot the system * @cmd: pointer to buffer containing command to execute for restart * or %NULL diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f175d98bd355..11d65b531e50 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1590,16 +1590,11 @@ void sysctl_head_get(struct ctl_table_header *head) spin_unlock(&sysctl_lock); } -static void free_head(struct rcu_head *rcu) -{ - kfree(container_of(rcu, struct ctl_table_header, rcu)); -} - void sysctl_head_put(struct ctl_table_header *head) { spin_lock(&sysctl_lock); if (!--head->count) - call_rcu(&head->rcu, free_head); + kfree_rcu(head, rcu); spin_unlock(&sysctl_lock); } @@ -1971,10 +1966,10 @@ void unregister_sysctl_table(struct ctl_table_header * header) start_unregistering(header); if (!--header->parent->count) { WARN_ON(1); - call_rcu(&header->parent->rcu, free_head); + kfree_rcu(header->parent, rcu); } if (!--header->count) - call_rcu(&header->rcu, free_head); + kfree_rcu(header, rcu); spin_unlock(&sysctl_lock); } diff --git a/kernel/taskstats.c b/kernel/taskstats.c index fc0f22005417..e19ce1454ee1 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -28,7 +28,7 @@ #include <linux/fs.h> #include <linux/file.h> #include <net/genetlink.h> -#include <asm/atomic.h> +#include <linux/atomic.h> /* * Maximum length of a cpumask that can be specified in @@ -291,30 +291,28 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) if (!cpumask_subset(mask, cpu_possible_mask)) return -EINVAL; - s = NULL; if (isadd == REGISTER) { for_each_cpu(cpu, mask) { - if (!s) - s = kmalloc_node(sizeof(struct listener), - GFP_KERNEL, cpu_to_node(cpu)); + s = kmalloc_node(sizeof(struct listener), + GFP_KERNEL, cpu_to_node(cpu)); if (!s) goto cleanup; + s->pid = pid; - INIT_LIST_HEAD(&s->list); s->valid = 1; listeners = &per_cpu(listener_array, cpu); down_write(&listeners->sem); - list_for_each_entry_safe(s2, tmp, &listeners->list, list) { - if (s2->pid == pid) - goto next_cpu; + list_for_each_entry(s2, &listeners->list, list) { + if (s2->pid == pid && s2->valid) + goto exists; } list_add(&s->list, &listeners->list); s = NULL; -next_cpu: +exists: up_write(&listeners->sem); + kfree(s); /* nop if NULL */ } - kfree(s); return 0; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 342408cf68dd..2b021b0e8507 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -604,6 +604,12 @@ static struct timespec timekeeping_suspend_time; */ static void __timekeeping_inject_sleeptime(struct timespec *delta) { + if (!timespec_valid(delta)) { + printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " + "sleep delta value!\n"); + return; + } + xtime = timespec_add(xtime, *delta); wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta); total_sleep_time = timespec_add(total_sleep_time, *delta); @@ -686,12 +692,34 @@ static void timekeeping_resume(void) static int timekeeping_suspend(void) { unsigned long flags; + struct timespec delta, delta_delta; + static struct timespec old_delta; read_persistent_clock(&timekeeping_suspend_time); write_seqlock_irqsave(&xtime_lock, flags); timekeeping_forward_now(); timekeeping_suspended = 1; + + /* + * To avoid drift caused by repeated suspend/resumes, + * which each can add ~1 second drift error, + * try to compensate so the difference in system time + * and persistent_clock time stays close to constant. + */ + delta = timespec_sub(xtime, timekeeping_suspend_time); + delta_delta = timespec_sub(delta, old_delta); + if (abs(delta_delta.tv_sec) >= 2) { + /* + * if delta_delta is too large, assume time correction + * has occured and set old_delta to the current delta. + */ + old_delta = delta; + } else { + /* Otherwise try to adjust old_system to compensate */ + timekeeping_suspend_time = + timespec_add(timekeeping_suspend_time, delta_delta); + } write_sequnlock_irqrestore(&xtime_lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2ad39e556cb4..cd3134510f3d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -82,7 +82,7 @@ config EVENT_POWER_TRACING_DEPRECATED power:power_frequency This is for userspace compatibility and will vanish after 5 kernel iterations, - namely 2.6.41. + namely 3.1. config CONTEXT_SWITCH_TRACER bool diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3f381d0b20a8..616846bcfee5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -2,7 +2,7 @@ #define _LINUX_KERNEL_TRACE_H #include <linux/fs.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/sched.h> #include <linux/clocksource.h> #include <linux/ring_buffer.h> diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 017fa376505d..fd3c8aae55e5 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -12,7 +12,7 @@ #include <linux/slab.h> #include <linux/time.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include "trace.h" #include "trace_output.h" diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0400553f0d04..25fb1b0e53fa 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -221,7 +221,7 @@ typedef unsigned long mayday_mask_t; * per-CPU workqueues: */ struct workqueue_struct { - unsigned int flags; /* I: WQ_* flags */ + unsigned int flags; /* W: WQ_* flags */ union { struct cpu_workqueue_struct __percpu *pcpu; struct cpu_workqueue_struct *single; @@ -240,6 +240,7 @@ struct workqueue_struct { mayday_mask_t mayday_mask; /* cpus requesting rescue */ struct worker *rescuer; /* I: rescue worker */ + int nr_drainers; /* W: drain in progress */ int saved_max_active; /* W: saved cwq max_active */ const char *name; /* I: workqueue name */ #ifdef CONFIG_LOCKDEP @@ -990,7 +991,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, debug_work_activate(work); /* if dying, only works from the same workqueue are allowed */ - if (unlikely(wq->flags & WQ_DYING) && + if (unlikely(wq->flags & WQ_DRAINING) && WARN_ON_ONCE(!is_chained_work(wq))) return; @@ -2381,6 +2382,54 @@ out_unlock: } EXPORT_SYMBOL_GPL(flush_workqueue); +/** + * drain_workqueue - drain a workqueue + * @wq: workqueue to drain + * + * Wait until the workqueue becomes empty. While draining is in progress, + * only chain queueing is allowed. IOW, only currently pending or running + * work items on @wq can queue further work items on it. @wq is flushed + * repeatedly until it becomes empty. The number of flushing is detemined + * by the depth of chaining and should be relatively short. Whine if it + * takes too long. + */ +void drain_workqueue(struct workqueue_struct *wq) +{ + unsigned int flush_cnt = 0; + unsigned int cpu; + + /* + * __queue_work() needs to test whether there are drainers, is much + * hotter than drain_workqueue() and already looks at @wq->flags. + * Use WQ_DRAINING so that queue doesn't have to check nr_drainers. + */ + spin_lock(&workqueue_lock); + if (!wq->nr_drainers++) + wq->flags |= WQ_DRAINING; + spin_unlock(&workqueue_lock); +reflush: + flush_workqueue(wq); + + for_each_cwq_cpu(cpu, wq) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + if (!cwq->nr_active && list_empty(&cwq->delayed_works)) + continue; + + if (++flush_cnt == 10 || + (flush_cnt % 100 == 0 && flush_cnt <= 1000)) + pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n", + wq->name, flush_cnt); + goto reflush; + } + + spin_lock(&workqueue_lock); + if (!--wq->nr_drainers) + wq->flags &= ~WQ_DRAINING; + spin_unlock(&workqueue_lock); +} +EXPORT_SYMBOL_GPL(drain_workqueue); + static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, bool wait_executing) { @@ -3009,34 +3058,10 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key); */ void destroy_workqueue(struct workqueue_struct *wq) { - unsigned int flush_cnt = 0; unsigned int cpu; - /* - * Mark @wq dying and drain all pending works. Once WQ_DYING is - * set, only chain queueing is allowed. IOW, only currently - * pending or running work items on @wq can queue further work - * items on it. @wq is flushed repeatedly until it becomes empty. - * The number of flushing is detemined by the depth of chaining and - * should be relatively short. Whine if it takes too long. - */ - wq->flags |= WQ_DYING; -reflush: - flush_workqueue(wq); - - for_each_cwq_cpu(cpu, wq) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - - if (!cwq->nr_active && list_empty(&cwq->delayed_works)) - continue; - - if (++flush_cnt == 10 || - (flush_cnt % 100 == 0 && flush_cnt <= 1000)) - printk(KERN_WARNING "workqueue %s: flush on " - "destruction isn't complete after %u tries\n", - wq->name, flush_cnt); - goto reflush; - } + /* drain it before proceeding with destruction */ + drain_workqueue(wq); /* * wq list is used to freeze wq, remove from list after |