Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.preempt | 3
-rw-r--r--  kernel/Makefile | 5
-rw-r--r--  kernel/audit.c | 31
-rw-r--r--  kernel/audit_tree.c | 8
-rw-r--r--  kernel/auditsc.c | 2
-rw-r--r--  kernel/cgroup.c | 18
-rw-r--r--  kernel/compat.c | 7
-rw-r--r--  kernel/configs.c | 4
-rw-r--r--  kernel/cpuset.c | 10
-rw-r--r--  kernel/debug/debug_core.c | 2
-rw-r--r--  kernel/debug/gdbstub.c | 22
-rw-r--r--  kernel/debug/kdb/kdb_bt.c | 5
-rw-r--r--  kernel/debug/kdb/kdb_cmds | 4
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 21
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 36
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 4
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 3
-rw-r--r--  kernel/delayacct.c | 2
-rw-r--r--  kernel/exit.c | 94
-rw-r--r--  kernel/fork.c | 124
-rw-r--r--  kernel/futex.c | 60
-rw-r--r--  kernel/gcov/Kconfig | 2
-rw-r--r--  kernel/irq/Kconfig | 4
-rw-r--r--  kernel/irq/Makefile | 1
-rw-r--r--  kernel/irq/devres.c | 2
-rw-r--r--  kernel/irq/generic-chip.c | 18
-rw-r--r--  kernel/irq/irqdomain.c | 180
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/kmod.c | 2
-rw-r--r--  kernel/lockdep.c | 68
-rw-r--r--  kernel/module.c | 80
-rw-r--r--  kernel/notifier.c | 31
-rw-r--r--  kernel/nsproxy.c | 4
-rw-r--r--  kernel/panic.c | 2
-rw-r--r--  kernel/params.c | 18
-rw-r--r--  kernel/pid.c | 1
-rw-r--r--  kernel/pm_qos_params.c | 6
-rw-r--r--  kernel/power/Kconfig | 12
-rw-r--r--  kernel/power/main.c | 5
-rw-r--r--  kernel/power/snapshot.c | 6
-rw-r--r--  kernel/power/suspend.c | 20
-rw-r--r--  kernel/printk.c | 24
-rw-r--r--  kernel/ptrace.c | 197
-rw-r--r--  kernel/rcupdate.c | 2
-rw-r--r--  kernel/rcutorture.c | 4
-rw-r--r--  kernel/rcutree.c | 26
-rw-r--r--  kernel/rcutree_plugin.h | 68
-rw-r--r--  kernel/rcutree_trace.c | 2
-rw-r--r--  kernel/resource.c | 137
-rw-r--r--  kernel/rtmutex.c | 2
-rw-r--r--  kernel/rwsem.c | 18
-rw-r--r--  kernel/sched.c | 457
-rw-r--r--  kernel/sched_autogroup.h | 1
-rw-r--r--  kernel/sched_fair.c | 118
-rw-r--r--  kernel/sched_features.h | 6
-rw-r--r--  kernel/sched_rt.c | 26
-rw-r--r--  kernel/signal.c | 461
-rw-r--r--  kernel/softirq.c | 12
-rw-r--r--  kernel/stop_machine.c | 80
-rw-r--r--  kernel/sys.c | 32
-rw-r--r--  kernel/sysctl.c | 11
-rw-r--r--  kernel/taskstats.c | 20
-rw-r--r--  kernel/time/timekeeping.c | 28
-rw-r--r--  kernel/trace/Kconfig | 2
-rw-r--r--  kernel/trace/trace.h | 2
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 2
-rw-r--r--  kernel/workqueue.c | 81
67 files changed, 1922 insertions(+), 826 deletions(-)
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index bf987b95b356..24e7cb0ba26a 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -35,6 +35,7 @@ config PREEMPT_VOLUNTARY
config PREEMPT
bool "Preemptible Kernel (Low-Latency Desktop)"
+ select PREEMPT_COUNT
help
This option reduces the latency of the kernel by making
all kernel code (that is not executing in a critical section)
@@ -52,3 +53,5 @@ config PREEMPT
endchoice
+config PREEMPT_COUNT
+ bool
\ No newline at end of file
diff --git a/kernel/Makefile b/kernel/Makefile
index 2d64cfcc8b42..d06467fc8f7c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -125,11 +125,10 @@ targets += config_data.gz
$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
$(call if_changed,gzip)
-quiet_cmd_ikconfiggz = IKCFG $@
- cmd_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
+ filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;")
targets += config_data.h
$(obj)/config_data.h: $(obj)/config_data.gz FORCE
- $(call if_changed,ikconfiggz)
+ $(call filechk,ikconfiggz)
$(obj)/time.o: $(obj)/timeconst.h
diff --git a/kernel/audit.c b/kernel/audit.c
index 939500317066..0a1355ca3d79 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -43,7 +43,7 @@
#include <linux/init.h>
#include <asm/types.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
@@ -55,6 +55,9 @@
#include <net/sock.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
+#ifdef CONFIG_SECURITY
+#include <linux/security.h>
+#endif
#include <linux/netlink.h>
#include <linux/freezer.h>
#include <linux/tty.h>
@@ -1502,6 +1505,32 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
}
}
+#ifdef CONFIG_SECURITY
+/**
+ * audit_log_secctx - Converts and logs SELinux context
+ * @ab: audit_buffer
+ * @secid: security number
+ *
+ * This is a helper function that calls security_secid_to_secctx to convert
+ * secid to secctx and then adds the (converted) SELinux context to the audit
+ * log by calling audit_log_format, thus also preventing leak of internal secid
+ * to userspace. If secid cannot be converted audit_panic is called.
+ */
+void audit_log_secctx(struct audit_buffer *ab, u32 secid)
+{
+ u32 len;
+ char *secctx;
+
+ if (security_secid_to_secctx(secid, &secctx, &len)) {
+ audit_panic("Cannot convert secid to context");
+ } else {
+ audit_log_format(ab, " obj=%s", secctx);
+ security_release_secctx(secctx, len);
+ }
+}
+EXPORT_SYMBOL(audit_log_secctx);
+#endif
+
EXPORT_SYMBOL(audit_log_start);
EXPORT_SYMBOL(audit_log_end);
EXPORT_SYMBOL(audit_log_format);
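A minimal caller sketch for the new helper (the function name, the AUDIT_KERNEL record type, and the "op=" field are illustrative assumptions, not part of this patch):

static void log_obj_context(struct audit_context *ctx, u32 secid)
{
	struct audit_buffer *ab;

	ab = audit_log_start(ctx, GFP_KERNEL, AUDIT_KERNEL);
	if (!ab)
		return;	/* auditing disabled or allocation failed */
	audit_log_format(ab, "op=example");
	audit_log_secctx(ab, secid);	/* appends " obj=<secctx>" or audit_panic()s */
	audit_log_end(ab);
}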
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e99dda04b126..5bf0790497e7 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -93,16 +93,10 @@ static inline void get_tree(struct audit_tree *tree)
atomic_inc(&tree->count);
}
-static void __put_tree(struct rcu_head *rcu)
-{
- struct audit_tree *tree = container_of(rcu, struct audit_tree, head);
- kfree(tree);
-}
-
static inline void put_tree(struct audit_tree *tree)
{
if (atomic_dec_and_test(&tree->count))
- call_rcu(&tree->head, __put_tree);
+ kfree_rcu(tree, head);
}
/* to avoid bringing the entire thing in audit.h */
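The conversion is the generic kfree_rcu() pattern, sketched here with a hypothetical structure:

struct foo {
	int data;
	struct rcu_head head;	/* rcu_head must be embedded in the object */
};

static void foo_put(struct foo *f)
{
	/* frees f after a grace period; replaces call_rcu() plus a
	 * callback whose only job is kfree(container_of(...)) */
	kfree_rcu(f, head);
}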
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 00d79df03e76..ce4b054acee5 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -44,7 +44,7 @@
#include <linux/init.h>
#include <asm/types.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/mm.h>
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2731d115d725..1d2b6ceea95d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -27,9 +27,11 @@
*/
#include <linux/cgroup.h>
+#include <linux/cred.h>
#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/fs.h>
+#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/mm.h>
@@ -59,7 +61,7 @@
#include <linux/poll.h>
#include <linux/flex_array.h> /* used in cgroup_attach_proc */
-#include <asm/atomic.h>
+#include <linux/atomic.h>
static DEFINE_MUTEX(cgroup_mutex);
@@ -1514,6 +1516,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
struct cgroup *root_cgrp = &root->top_cgroup;
struct inode *inode;
struct cgroupfs_root *existing_root;
+ const struct cred *cred;
int i;
BUG_ON(sb->s_root != NULL);
@@ -1593,7 +1596,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
BUG_ON(!list_empty(&root_cgrp->children));
BUG_ON(root->number_of_cgroups != 1);
+ cred = override_creds(&init_cred);
cgroup_populate_dir(root_cgrp);
+ revert_creds(cred);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&inode->i_mutex);
} else {
@@ -1697,7 +1702,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
{
char *start;
struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
- rcu_read_lock_held() ||
cgroup_lock_is_held());
if (!dentry || cgrp == dummytop) {
@@ -1723,7 +1727,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
break;
dentry = rcu_dereference_check(cgrp->dentry,
- rcu_read_lock_held() ||
cgroup_lock_is_held());
if (!cgrp->parent)
continue;
@@ -3542,7 +3545,8 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
}
/* the process need read permission on control file */
- ret = file_permission(cfile, MAY_READ);
+ /* AV: shouldn't we check that it's been opened for read instead? */
+ ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ);
if (ret < 0)
goto fail;
@@ -4813,8 +4817,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
* on this or this is under rcu_read_lock(). Once css->id is allocated,
* it's unchanged until freed.
*/
- cssid = rcu_dereference_check(css->id,
- rcu_read_lock_held() || atomic_read(&css->refcnt));
+ cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
if (cssid)
return cssid->id;
@@ -4826,8 +4829,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
{
struct css_id *cssid;
- cssid = rcu_dereference_check(css->id,
- rcu_read_lock_held() || atomic_read(&css->refcnt));
+ cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
if (cssid)
return cssid->depth;
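The credential override added to cgroup_mount() follows the standard pairing, shown in isolation as a sketch:

const struct cred *old_cred;

old_cred = override_creds(&init_cred);	/* temporarily act with init's full capabilities */
/* ... work that must not be constrained by the mounting task's creds ... */
revert_creds(old_cred);			/* restore the caller's credentials */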
diff --git a/kernel/compat.c b/kernel/compat.c
index fc9eb093acd5..e2435ee9993a 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -158,6 +158,7 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user
__put_user(ts->tv_sec, &cts->tv_sec) ||
__put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
}
+EXPORT_SYMBOL_GPL(put_compat_timespec);
static long compat_nanosleep_restart(struct restart_block *restart)
{
@@ -890,6 +891,7 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
}
}
+EXPORT_SYMBOL_GPL(sigset_from_compat);
asmlinkage long
compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
@@ -991,11 +993,8 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
sigset_from_compat(&newset, &newset32);
sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
- spin_lock_irq(&current->sighand->siglock);
current->saved_sigmask = current->blocked;
- current->blocked = newset;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
+ set_current_blocked(&newset);
current->state = TASK_INTERRUPTIBLE;
schedule();
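For contrast, a sketch of what set_current_blocked() subsumes; the helper performs the same update under siglock and retargets pending signals internally:

/* open-coded form removed by the hunk above */
spin_lock_irq(&current->sighand->siglock);
current->blocked = newset;
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);

/* equivalent single call */
set_current_blocked(&newset);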
diff --git a/kernel/configs.c b/kernel/configs.c
index b4066b44a99d..42e8fa075eed 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -92,8 +92,8 @@ static void __exit ikconfig_cleanup(void)
module_init(ikconfig_init);
module_exit(ikconfig_cleanup);
+#endif /* CONFIG_IKCONFIG_PROC */
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Randy Dunlap");
MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel");
-
-#endif /* CONFIG_IKCONFIG_PROC */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9c9b7545c810..10131fdaff70 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -55,7 +55,7 @@
#include <linux/sort.h>
#include <asm/uaccess.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
@@ -2460,11 +2460,19 @@ static int cpuset_spread_node(int *rotor)
int cpuset_mem_spread_node(void)
{
+ if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
+ current->cpuset_mem_spread_rotor =
+ node_random(&current->mems_allowed);
+
return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
}
int cpuset_slab_spread_node(void)
{
+ if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
+ current->cpuset_slab_spread_rotor =
+ node_random(&current->mems_allowed);
+
return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
}
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index bad6786dee88..0d7c08784efb 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -51,7 +51,7 @@
#include <asm/cacheflush.h>
#include <asm/byteorder.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/system.h>
#include "debug_core.h"
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index a11db956dd62..34872482315e 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -42,6 +42,8 @@
/* Our I/O buffers. */
static char remcom_in_buffer[BUFMAX];
static char remcom_out_buffer[BUFMAX];
+static int gdbstub_use_prev_in_buf;
+static int gdbstub_prev_in_buf_pos;
/* Storage for the registers, in GDB format. */
static unsigned long gdb_regs[(NUMREGBYTES +
@@ -58,6 +60,13 @@ static int gdbstub_read_wait(void)
int ret = -1;
int i;
+ if (unlikely(gdbstub_use_prev_in_buf)) {
+ if (gdbstub_prev_in_buf_pos < gdbstub_use_prev_in_buf)
+ return remcom_in_buffer[gdbstub_prev_in_buf_pos++];
+ else
+ gdbstub_use_prev_in_buf = 0;
+ }
+
/* poll any additional I/O interfaces that are defined */
while (ret < 0)
for (i = 0; kdb_poll_funcs[i] != NULL; i++) {
@@ -109,7 +118,6 @@ static void get_packet(char *buffer)
buffer[count] = ch;
count = count + 1;
}
- buffer[count] = 0;
if (ch == '#') {
xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4;
@@ -124,6 +132,7 @@ static void get_packet(char *buffer)
if (dbg_io_ops->flush)
dbg_io_ops->flush();
}
+ buffer[count] = 0;
} while (checksum != xmitcsum);
}
@@ -1082,12 +1091,11 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd)
case 'c':
strcpy(remcom_in_buffer, cmd);
return 0;
- case '?':
- gdb_cmd_status(ks);
- break;
- case '\0':
- strcpy(remcom_out_buffer, "");
- break;
+ case '$':
+ strcpy(remcom_in_buffer, cmd);
+ gdbstub_use_prev_in_buf = strlen(remcom_in_buffer);
+ gdbstub_prev_in_buf_pos = 0;
+ return 0;
}
dbg_io_ops->write_char('+');
put_packet(remcom_out_buffer);
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 2f62fe85f16a..7179eac7b41c 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -112,9 +112,8 @@ kdb_bt(int argc, const char **argv)
unsigned long addr;
long offset;
- kdbgetintenv("BTARGS", &argcount); /* Arguments to print */
- kdbgetintenv("BTAPROMPT", &btaprompt); /* Prompt after each
- * proc in bta */
+ /* Prompt after each proc in bta */
+ kdbgetintenv("BTAPROMPT", &btaprompt);
if (strcmp(argv[0], "bta") == 0) {
struct task_struct *g, *p;
diff --git a/kernel/debug/kdb/kdb_cmds b/kernel/debug/kdb/kdb_cmds
index 56c88e4db309..9834ad303ab6 100644
--- a/kernel/debug/kdb/kdb_cmds
+++ b/kernel/debug/kdb/kdb_cmds
@@ -18,16 +18,12 @@ defcmd dumpcommon "" "Common kdb debugging"
endefcmd
defcmd dumpall "" "First line debugging"
- set BTSYMARG 1
- set BTARGS 9
pid R
-dumpcommon
-bta
endefcmd
defcmd dumpcpu "" "Same as dumpall but only tasks on cpus"
- set BTSYMARG 1
- set BTARGS 9
pid R
-dumpcommon
-btc
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index dd0b1b7dd02c..d9ca9aa481ec 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -30,6 +30,8 @@ EXPORT_SYMBOL_GPL(kdb_poll_funcs);
int kdb_poll_idx = 1;
EXPORT_SYMBOL_GPL(kdb_poll_idx);
+static struct kgdb_state *kdb_ks;
+
int kdb_stub(struct kgdb_state *ks)
{
int error = 0;
@@ -39,6 +41,7 @@ int kdb_stub(struct kgdb_state *ks)
kdb_dbtrap_t db_result = KDB_DB_NOBPT;
int i;
+ kdb_ks = ks;
if (KDB_STATE(REENTRY)) {
reason = KDB_REASON_SWITCH;
KDB_STATE_CLEAR(REENTRY);
@@ -123,20 +126,8 @@ int kdb_stub(struct kgdb_state *ks)
KDB_STATE_CLEAR(PAGER);
kdbnearsym_cleanup();
if (error == KDB_CMD_KGDB) {
- if (KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)) {
- /*
- * This inteface glue which allows kdb to transition in into
- * the gdb stub. In order to do this the '?' or '' gdb serial
- * packet response is processed here. And then control is
- * passed to the gdbstub.
- */
- if (KDB_STATE(DOING_KGDB))
- gdbstub_state(ks, "?");
- else
- gdbstub_state(ks, "");
+ if (KDB_STATE(DOING_KGDB))
KDB_STATE_CLEAR(DOING_KGDB);
- KDB_STATE_CLEAR(DOING_KGDB2);
- }
return DBG_PASS_EVENT;
}
kdb_bp_install(ks->linux_regs);
@@ -166,3 +157,7 @@ int kdb_stub(struct kgdb_state *ks)
return kgdb_info[ks->cpu].ret_state;
}
+void kdb_gdb_state_pass(char *buf)
+{
+ gdbstub_state(kdb_ks, buf);
+}
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 96fdaac46a80..4802eb5840e1 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -31,15 +31,21 @@ char kdb_prompt_str[CMD_BUFLEN];
int kdb_trap_printk;
-static void kgdb_transition_check(char *buffer)
+static int kgdb_transition_check(char *buffer)
{
- int slen = strlen(buffer);
- if (strncmp(buffer, "$?#3f", slen) != 0 &&
- strncmp(buffer, "$qSupported#37", slen) != 0 &&
- strncmp(buffer, "+$qSupported#37", slen) != 0) {
+ if (buffer[0] != '+' && buffer[0] != '$') {
KDB_STATE_SET(KGDB_TRANS);
kdb_printf("%s", buffer);
+ } else {
+ int slen = strlen(buffer);
+ if (slen > 3 && buffer[slen - 3] == '#') {
+ kdb_gdb_state_pass(buffer);
+ strcpy(buffer, "kgdb");
+ KDB_STATE_SET(DOING_KGDB);
+ return 1;
+ }
}
+ return 0;
}
static int kdb_read_get_key(char *buffer, size_t bufsize)
@@ -251,6 +257,10 @@ poll_again:
case 13: /* enter */
*lastchar++ = '\n';
*lastchar++ = '\0';
+ if (!KDB_STATE(KGDB_TRANS)) {
+ KDB_STATE_SET(KGDB_TRANS);
+ kdb_printf("%s", buffer);
+ }
kdb_printf("\n");
return buffer;
case 4: /* Del */
@@ -382,22 +392,26 @@ poll_again:
* printed characters if we think that
* kgdb is connecting, until the check
* fails */
- if (!KDB_STATE(KGDB_TRANS))
- kgdb_transition_check(buffer);
- else
+ if (!KDB_STATE(KGDB_TRANS)) {
+ if (kgdb_transition_check(buffer))
+ return buffer;
+ } else {
kdb_printf("%c", key);
+ }
}
/* Special escape to kgdb */
if (lastchar - buffer >= 5 &&
strcmp(lastchar - 5, "$?#3f") == 0) {
+ kdb_gdb_state_pass(lastchar - 5);
strcpy(buffer, "kgdb");
KDB_STATE_SET(DOING_KGDB);
return buffer;
}
- if (lastchar - buffer >= 14 &&
- strcmp(lastchar - 14, "$qSupported#37") == 0) {
+ if (lastchar - buffer >= 11 &&
+ strcmp(lastchar - 11, "$qSupported") == 0) {
+ kdb_gdb_state_pass(lastchar - 11);
strcpy(buffer, "kgdb");
- KDB_STATE_SET(DOING_KGDB2);
+ KDB_STATE_SET(DOING_KGDB);
return buffer;
}
}
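The relaxed check keys on GDB remote-serial framing rather than a fixed list of packets; an illustrative predicate (not part of the patch):

/*
 * A gdb remote packet is framed as $<payload>#<two hex checksum digits>,
 * optionally preceded by a '+' ack; anything else is ordinary kdb input.
 */
static int looks_like_gdb_packet(const char *buf)
{
	int slen = strlen(buf);

	return (buf[0] == '$' || buf[0] == '+') &&
	       slen > 3 && buf[slen - 3] == '#';
}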
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index be14779bcef6..63786e71a3cd 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -145,7 +145,6 @@ static char *__env[] = {
#endif
"RADIX=16",
"MDCOUNT=8", /* lines of md output */
- "BTARGS=9", /* 9 possible args in bt */
KDB_PLATFORM_ENV,
"DTABCOUNT=30",
"NOSECT=1",
@@ -172,6 +171,7 @@ static char *__env[] = {
(char *)0,
(char *)0,
(char *)0,
+ (char *)0,
};
static const int __nenv = (sizeof(__env) / sizeof(char *));
@@ -1386,7 +1386,7 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
}
if (result == KDB_CMD_KGDB) {
- if (!(KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)))
+ if (!KDB_STATE(DOING_KGDB))
kdb_printf("Entering please attach debugger "
"or use $D#44+ or $3#33\n");
break;
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 35d69ed1dfb5..e381d105b40b 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -21,7 +21,6 @@
#define KDB_CMD_SS (-1003)
#define KDB_CMD_SSB (-1004)
#define KDB_CMD_KGDB (-1005)
-#define KDB_CMD_KGDB2 (-1006)
/* Internal debug flags */
#define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */
@@ -146,7 +145,6 @@ extern int kdb_state;
* keyboard on this cpu */
#define KDB_STATE_KEXEC 0x00040000 /* kexec issued */
#define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */
-#define KDB_STATE_DOING_KGDB2 0x00100000 /* kgdb enter now issued */
#define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */
#define KDB_STATE_ARCH 0xff000000 /* Reserved for arch
* specific use */
@@ -218,6 +216,7 @@ extern void kdb_print_nameval(const char *name, unsigned long val);
extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
extern void kdb_meminfo_proc_show(void);
extern char *kdb_getstr(char *, size_t, char *);
+extern void kdb_gdb_state_pass(char *buf);
/* Defines for kdb_symbol_print */
#define KDB_SP_SPACEB 0x0001 /* Space before string */
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index ead9b610aa71..418b3f7053aa 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -19,8 +19,10 @@
#include <linux/time.h>
#include <linux/sysctl.h>
#include <linux/delayacct.h>
+#include <linux/module.h>
int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
+EXPORT_SYMBOL_GPL(delayacct_on);
struct kmem_cache *delayacct_cache;
static int __init delayacct_setup_disable(char *str)
diff --git a/kernel/exit.c b/kernel/exit.c
index f2b321bae440..2913b3509d42 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -85,7 +85,6 @@ static void __exit_signal(struct task_struct *tsk)
struct tty_struct *uninitialized_var(tty);
sighand = rcu_dereference_check(tsk->sighand,
- rcu_read_lock_held() ||
lockdep_tasklist_lock_is_held());
spin_lock(&sighand->siglock);
@@ -169,7 +168,6 @@ void release_task(struct task_struct * p)
struct task_struct *leader;
int zap_leader;
repeat:
- tracehook_prepare_release_task(p);
/* don't need to get the RCU readlock here - the process is dead and
* can't be modifying its own credentials. But shut RCU-lockdep up */
rcu_read_lock();
@@ -179,7 +177,7 @@ repeat:
proc_flush_task(p);
write_lock_irq(&tasklist_lock);
- tracehook_finish_release_task(p);
+ ptrace_release_task(p);
__exit_signal(p);
/*
@@ -190,22 +188,12 @@ repeat:
zap_leader = 0;
leader = p->group_leader;
if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
- BUG_ON(task_detached(leader));
- do_notify_parent(leader, leader->exit_signal);
/*
* If we were the last child thread and the leader has
* exited already, and the leader's parent ignores SIGCHLD,
* then we are the one who should release the leader.
- *
- * do_notify_parent() will have marked it self-reaping in
- * that case.
- */
- zap_leader = task_detached(leader);
-
- /*
- * This maintains the invariant that release_task()
- * only runs on a task in EXIT_DEAD, just for sanity.
*/
+ zap_leader = do_notify_parent(leader, leader->exit_signal);
if (zap_leader)
leader->exit_state = EXIT_DEAD;
}
@@ -277,18 +265,16 @@ int is_current_pgrp_orphaned(void)
return retval;
}
-static int has_stopped_jobs(struct pid *pgrp)
+static bool has_stopped_jobs(struct pid *pgrp)
{
- int retval = 0;
struct task_struct *p;
do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
- if (!task_is_stopped(p))
- continue;
- retval = 1;
- break;
+ if (p->signal->flags & SIGNAL_STOP_STOPPED)
+ return true;
} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
- return retval;
+
+ return false;
}
/*
@@ -751,7 +737,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
{
list_move_tail(&p->sibling, &p->real_parent->children);
- if (task_detached(p))
+ if (p->exit_state == EXIT_DEAD)
return;
/*
* If this is a threaded reparent there is no need to
@@ -764,10 +750,9 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
p->exit_signal = SIGCHLD;
/* If it has exited notify the new parent about this child's death. */
- if (!task_ptrace(p) &&
+ if (!p->ptrace &&
p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
- do_notify_parent(p, p->exit_signal);
- if (task_detached(p)) {
+ if (do_notify_parent(p, p->exit_signal)) {
p->exit_state = EXIT_DEAD;
list_move_tail(&p->sibling, dead);
}
@@ -794,7 +779,7 @@ static void forget_original_parent(struct task_struct *father)
do {
t->real_parent = reaper;
if (t->parent == father) {
- BUG_ON(task_ptrace(t));
+ BUG_ON(t->ptrace);
t->parent = t->real_parent;
}
if (t->pdeath_signal)
@@ -819,8 +804,7 @@ static void forget_original_parent(struct task_struct *father)
*/
static void exit_notify(struct task_struct *tsk, int group_dead)
{
- int signal;
- void *cookie;
+ bool autoreap;
/*
* This does two things:
@@ -851,26 +835,33 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
* we have changed execution domain as these two values started
* the same after a fork.
*/
- if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
+ if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD &&
(tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
tsk->self_exec_id != tsk->parent_exec_id))
tsk->exit_signal = SIGCHLD;
- signal = tracehook_notify_death(tsk, &cookie, group_dead);
- if (signal >= 0)
- signal = do_notify_parent(tsk, signal);
+ if (unlikely(tsk->ptrace)) {
+ int sig = thread_group_leader(tsk) &&
+ thread_group_empty(tsk) &&
+ !ptrace_reparented(tsk) ?
+ tsk->exit_signal : SIGCHLD;
+ autoreap = do_notify_parent(tsk, sig);
+ } else if (thread_group_leader(tsk)) {
+ autoreap = thread_group_empty(tsk) &&
+ do_notify_parent(tsk, tsk->exit_signal);
+ } else {
+ autoreap = true;
+ }
- tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
+ tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
/* mt-exec, de_thread() is waiting for group leader */
if (unlikely(tsk->signal->notify_count < 0))
wake_up_process(tsk->signal->group_exit_task);
write_unlock_irq(&tasklist_lock);
- tracehook_report_death(tsk, signal, cookie, group_dead);
-
/* If the process is dead, release it - nobody will wait for it */
- if (signal == DEATH_REAP)
+ if (autoreap)
release_task(tsk);
}
@@ -906,7 +897,6 @@ NORET_TYPE void do_exit(long code)
profile_task_exit(tsk);
- WARN_ON(atomic_read(&tsk->fs_excl));
WARN_ON(blk_needs_flush_plug(tsk));
if (unlikely(in_interrupt()))
@@ -923,7 +913,7 @@ NORET_TYPE void do_exit(long code)
*/
set_fs(USER_DS);
- tracehook_report_exit(&code);
+ ptrace_event(PTRACE_EVENT_EXIT, code);
validate_creds_for_do_exit(tsk);
@@ -990,6 +980,7 @@ NORET_TYPE void do_exit(long code)
trace_sched_process_exit(tsk);
exit_sem(tsk);
+ exit_shm(tsk);
exit_files(tsk);
exit_fs(tsk);
check_stack_usage();
@@ -1235,9 +1226,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
traced = ptrace_reparented(p);
/*
* It can be ptraced but not reparented, check
- * !task_detached() to filter out sub-threads.
+ * thread_group_leader() to filter out sub-threads.
*/
- if (likely(!traced) && likely(!task_detached(p))) {
+ if (likely(!traced) && thread_group_leader(p)) {
struct signal_struct *psig;
struct signal_struct *sig;
unsigned long maxrss;
@@ -1345,16 +1336,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
/* We dropped tasklist, ptracer could die and untrace */
ptrace_unlink(p);
/*
- * If this is not a detached task, notify the parent.
- * If it's still not detached after that, don't release
- * it now.
+ * If this is not a sub-thread, notify the parent.
+ * If parent wants a zombie, don't release it now.
*/
- if (!task_detached(p)) {
- do_notify_parent(p, p->exit_signal);
- if (!task_detached(p)) {
- p->exit_state = EXIT_ZOMBIE;
- p = NULL;
- }
+ if (thread_group_leader(p) &&
+ !do_notify_parent(p, p->exit_signal)) {
+ p->exit_state = EXIT_ZOMBIE;
+ p = NULL;
}
write_unlock_irq(&tasklist_lock);
}
@@ -1367,7 +1355,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
static int *task_stopped_code(struct task_struct *p, bool ptrace)
{
if (ptrace) {
- if (task_is_stopped_or_traced(p))
+ if (task_is_stopped_or_traced(p) &&
+ !(p->jobctl & JOBCTL_LISTENING))
return &p->exit_code;
} else {
if (p->signal->flags & SIGNAL_STOP_STOPPED)
@@ -1563,7 +1552,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
* Notification and reaping will be cascaded to the real
* parent when the ptracer detaches.
*/
- if (likely(!ptrace) && unlikely(task_ptrace(p))) {
+ if (likely(!ptrace) && unlikely(p->ptrace)) {
/* it will become visible, clear notask_error */
wo->notask_error = 0;
return 0;
@@ -1606,8 +1595,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
* own children, it should create a separate process which
* takes the role of real parent.
*/
- if (likely(!ptrace) && task_ptrace(p) &&
- same_thread_group(p->parent, p->real_parent))
+ if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p))
return 0;
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 0276c30401a0..e7ceaca89609 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -37,7 +37,6 @@
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/jiffies.h>
-#include <linux/tracehook.h>
#include <linux/futex.h>
#include <linux/compat.h>
#include <linux/kthread.h>
@@ -81,7 +80,7 @@
* Protected counters by write_lock_irq(&tasklist_lock)
*/
unsigned long total_forks; /* Handle normal Linux uptimes. */
-int nr_threads; /* The idle threads do not count.. */
+int nr_threads; /* The idle threads do not count.. */
int max_threads; /* tunable limit on nr_threads */
@@ -233,7 +232,7 @@ void __init fork_init(unsigned long mempages)
/*
* we need to allow at least 20 threads to boot a system
*/
- if(max_threads < 20)
+ if (max_threads < 20)
max_threads = 20;
init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
@@ -269,7 +268,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
return NULL;
}
- err = arch_dup_task_struct(tsk, orig);
+ err = arch_dup_task_struct(tsk, orig);
if (err)
goto out;
@@ -289,9 +288,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
tsk->stack_canary = get_random_int();
#endif
- /* One for us, one for whoever does the "release_task()" (usually parent) */
- atomic_set(&tsk->usage,2);
- atomic_set(&tsk->fs_excl, 0);
+ /*
+ * One for us, one for whoever does the "release_task()" (usually
+ * parent)
+ */
+ atomic_set(&tsk->usage, 2);
#ifdef CONFIG_BLK_DEV_IO_TRACE
tsk->btrace_seq = 0;
#endif
@@ -439,7 +440,7 @@ fail_nomem:
goto out;
}
-static inline int mm_alloc_pgd(struct mm_struct * mm)
+static inline int mm_alloc_pgd(struct mm_struct *mm)
{
mm->pgd = pgd_alloc(mm);
if (unlikely(!mm->pgd))
@@ -447,7 +448,7 @@ static inline int mm_alloc_pgd(struct mm_struct * mm)
return 0;
}
-static inline void mm_free_pgd(struct mm_struct * mm)
+static inline void mm_free_pgd(struct mm_struct *mm)
{
pgd_free(mm, mm->pgd);
}
@@ -484,7 +485,7 @@ static void mm_init_aio(struct mm_struct *mm)
#endif
}
-static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
+static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
{
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
@@ -515,9 +516,9 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
/*
* Allocate and initialize an mm_struct.
*/
-struct mm_struct * mm_alloc(void)
+struct mm_struct *mm_alloc(void)
{
- struct mm_struct * mm;
+ struct mm_struct *mm;
mm = allocate_mm();
if (!mm)
@@ -585,7 +586,7 @@ void added_exe_file_vma(struct mm_struct *mm)
void removed_exe_file_vma(struct mm_struct *mm)
{
mm->num_exe_file_vmas--;
- if ((mm->num_exe_file_vmas == 0) && mm->exe_file){
+ if ((mm->num_exe_file_vmas == 0) && mm->exe_file) {
fput(mm->exe_file);
mm->exe_file = NULL;
}
@@ -777,9 +778,9 @@ fail_nocontext:
return NULL;
}
-static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
+static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
{
- struct mm_struct * mm, *oldmm;
+ struct mm_struct *mm, *oldmm;
int retval;
tsk->min_flt = tsk->maj_flt = 0;
@@ -846,7 +847,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
return 0;
}
-static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
+static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
{
struct files_struct *oldf, *newf;
int error = 0;
@@ -1013,7 +1014,7 @@ static void rt_mutex_init_task(struct task_struct *p)
{
raw_spin_lock_init(&p->pi_lock);
#ifdef CONFIG_RT_MUTEXES
- plist_head_init_raw(&p->pi_waiters, &p->pi_lock);
+ plist_head_init(&p->pi_waiters);
p->pi_blocked_on = NULL;
#endif
}
@@ -1168,13 +1169,17 @@ static struct task_struct *copy_process(unsigned long clone_flags,
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
- if (IS_ERR(p->mempolicy)) {
- retval = PTR_ERR(p->mempolicy);
- p->mempolicy = NULL;
- goto bad_fork_cleanup_cgroup;
- }
+ if (IS_ERR(p->mempolicy)) {
+ retval = PTR_ERR(p->mempolicy);
+ p->mempolicy = NULL;
+ goto bad_fork_cleanup_cgroup;
+ }
mpol_fix_fork_child_flag(p);
#endif
+#ifdef CONFIG_CPUSETS
+ p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
+ p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
+#endif
#ifdef CONFIG_TRACE_IRQFLAGS
p->irq_events = 0;
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
@@ -1214,25 +1219,33 @@ static struct task_struct *copy_process(unsigned long clone_flags,
retval = perf_event_init_task(p);
if (retval)
goto bad_fork_cleanup_policy;
-
- if ((retval = audit_alloc(p)))
+ retval = audit_alloc(p);
+ if (retval)
goto bad_fork_cleanup_policy;
/* copy all the process information */
- if ((retval = copy_semundo(clone_flags, p)))
+ retval = copy_semundo(clone_flags, p);
+ if (retval)
goto bad_fork_cleanup_audit;
- if ((retval = copy_files(clone_flags, p)))
+ retval = copy_files(clone_flags, p);
+ if (retval)
goto bad_fork_cleanup_semundo;
- if ((retval = copy_fs(clone_flags, p)))
+ retval = copy_fs(clone_flags, p);
+ if (retval)
goto bad_fork_cleanup_files;
- if ((retval = copy_sighand(clone_flags, p)))
+ retval = copy_sighand(clone_flags, p);
+ if (retval)
goto bad_fork_cleanup_fs;
- if ((retval = copy_signal(clone_flags, p)))
+ retval = copy_signal(clone_flags, p);
+ if (retval)
goto bad_fork_cleanup_sighand;
- if ((retval = copy_mm(clone_flags, p)))
+ retval = copy_mm(clone_flags, p);
+ if (retval)
goto bad_fork_cleanup_signal;
- if ((retval = copy_namespaces(clone_flags, p)))
+ retval = copy_namespaces(clone_flags, p);
+ if (retval)
goto bad_fork_cleanup_mm;
- if ((retval = copy_io(clone_flags, p)))
+ retval = copy_io(clone_flags, p);
+ if (retval)
goto bad_fork_cleanup_namespaces;
retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
if (retval)
@@ -1254,7 +1267,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
/*
* Clear TID on mm_release()?
*/
- p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
+ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
#ifdef CONFIG_BLOCK
p->plug = NULL;
#endif
@@ -1322,7 +1335,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
* it's process group.
* A fatal signal pending means that current will exit, so the new
* thread can't slip out of an OOM kill (or normal SIGKILL).
- */
+ */
recalc_sigpending();
if (signal_pending(current)) {
spin_unlock(&current->sighand->siglock);
@@ -1340,7 +1353,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
}
if (likely(p->pid)) {
- tracehook_finish_clone(p, clone_flags, trace);
+ ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
if (thread_group_leader(p)) {
if (is_child_reaper(pid))
@@ -1481,10 +1494,22 @@ long do_fork(unsigned long clone_flags,
}
/*
- * When called from kernel_thread, don't do user tracing stuff.
+ * Determine whether and which event to report to ptracer. When
+ * called from kernel_thread or CLONE_UNTRACED is explicitly
+ * requested, no event is reported; otherwise, report if the event
+ * for the type of forking is enabled.
*/
- if (likely(user_mode(regs)))
- trace = tracehook_prepare_clone(clone_flags);
+ if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) {
+ if (clone_flags & CLONE_VFORK)
+ trace = PTRACE_EVENT_VFORK;
+ else if ((clone_flags & CSIGNAL) != SIGCHLD)
+ trace = PTRACE_EVENT_CLONE;
+ else
+ trace = PTRACE_EVENT_FORK;
+
+ if (likely(!ptrace_event_enabled(current, trace)))
+ trace = 0;
+ }
p = copy_process(clone_flags, stack_start, regs, stack_size,
child_tidptr, NULL, trace);
@@ -1508,26 +1533,26 @@ long do_fork(unsigned long clone_flags,
}
audit_finish_fork(p);
- tracehook_report_clone(regs, clone_flags, nr, p);
/*
* We set PF_STARTING at creation in case tracing wants to
* use this to distinguish a fully live task from one that
- * hasn't gotten to tracehook_report_clone() yet. Now we
- * clear it and set the child going.
+ * hasn't finished SIGSTOP raising yet. Now we clear it
+ * and set the child going.
*/
p->flags &= ~PF_STARTING;
wake_up_new_task(p);
- tracehook_report_clone_complete(trace, regs,
- clone_flags, nr, p);
+ /* forking complete and child started to run, tell ptracer */
+ if (unlikely(trace))
+ ptrace_event(trace, nr);
if (clone_flags & CLONE_VFORK) {
freezer_do_not_count();
wait_for_completion(&vfork);
freezer_count();
- tracehook_report_vfork_done(p, nr);
+ ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
}
} else {
nr = PTR_ERR(p);
@@ -1574,6 +1599,7 @@ void __init proc_caches_init(void)
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
mmap_init();
+ nsproxy_cache_init();
}
/*
@@ -1670,12 +1696,14 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
*/
if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
do_sysvsem = 1;
- if ((err = unshare_fs(unshare_flags, &new_fs)))
+ err = unshare_fs(unshare_flags, &new_fs);
+ if (err)
goto bad_unshare_out;
- if ((err = unshare_fd(unshare_flags, &new_fd)))
+ err = unshare_fd(unshare_flags, &new_fd);
+ if (err)
goto bad_unshare_cleanup_fs;
- if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
- new_fs)))
+ err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs);
+ if (err)
goto bad_unshare_cleanup_fd;
if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
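The fork/clone events selected in do_fork() above are only delivered to a tracer that opted in; a hypothetical userspace tracer sketch, for context (not kernel code):

ptrace(PTRACE_SETOPTIONS, pid, NULL,
       (void *)(PTRACE_O_TRACEFORK | PTRACE_O_TRACEVFORK |
		PTRACE_O_TRACECLONE | PTRACE_O_TRACEVFORKDONE));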
diff --git a/kernel/futex.c b/kernel/futex.c
index fe28dc282eae..11cbe052b2e8 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -218,6 +218,8 @@ static void drop_futex_key_refs(union futex_key *key)
* @uaddr: virtual address of the futex
* @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
* @key: address where result is stored.
+ * @rw: mapping needs to be read/write (values: VERIFY_READ,
+ * VERIFY_WRITE)
*
* Returns a negative error code or 0
* The key words are stored in *key on success.
@@ -229,12 +231,12 @@ static void drop_futex_key_refs(union futex_key *key)
* lock_page() might sleep, the caller should not hold a spinlock.
*/
static int
-get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
+get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
struct page *page, *page_head;
- int err;
+ int err, ro = 0;
/*
* The futex address must be "naturally" aligned.
@@ -262,8 +264,18 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
again:
err = get_user_pages_fast(address, 1, 1, &page);
+ /*
+ * If write access is not required (eg. FUTEX_WAIT), try
+ * and get read-only access.
+ */
+ if (err == -EFAULT && rw == VERIFY_READ) {
+ err = get_user_pages_fast(address, 1, 0, &page);
+ ro = 1;
+ }
if (err < 0)
return err;
+ else
+ err = 0;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
page_head = page;
@@ -305,6 +317,13 @@ again:
if (!page_head->mapping) {
unlock_page(page_head);
put_page(page_head);
+ /*
+ * ZERO_PAGE pages don't have a mapping. Avoid a busy loop
+ * trying to find one. RW mapping would have COW'd (and thus
+ * have a mapping) so this page is RO and won't ever change.
+ */
+ if ((page_head == ZERO_PAGE(address)))
+ return -EFAULT;
goto again;
}
@@ -316,6 +335,15 @@ again:
* the object not the particular process.
*/
if (PageAnon(page_head)) {
+ /*
+ * A RO anonymous page will never change and thus doesn't make
+ * sense for futex operations.
+ */
+ if (ro) {
+ err = -EFAULT;
+ goto out;
+ }
+
key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
key->private.mm = mm;
key->private.address = address;
@@ -327,9 +355,10 @@ again:
get_futex_key_refs(key);
+out:
unlock_page(page_head);
put_page(page_head);
- return 0;
+ return err;
}
static inline void put_futex_key(union futex_key *key)
@@ -355,8 +384,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
int ret;
down_read(&mm->mmap_sem);
- ret = get_user_pages(current, mm, (unsigned long)uaddr,
- 1, 1, 0, NULL, NULL);
+ ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
+ FAULT_FLAG_WRITE);
up_read(&mm->mmap_sem);
return ret < 0 ? ret : 0;
@@ -940,7 +969,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
if (!bitset)
return -EINVAL;
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ);
if (unlikely(ret != 0))
goto out;
@@ -986,10 +1015,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
int ret, op_ret;
retry:
- ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
+ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
if (unlikely(ret != 0))
goto out_put_key1;
@@ -1243,10 +1272,11 @@ retry:
pi_state = NULL;
}
- ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
+ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
+ requeue_pi ? VERIFY_WRITE : VERIFY_READ);
if (unlikely(ret != 0))
goto out_put_key1;
@@ -1790,7 +1820,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
* while the syscall executes.
*/
retry:
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ);
if (unlikely(ret != 0))
return ret;
@@ -1941,7 +1971,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
}
retry:
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key);
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
if (unlikely(ret != 0))
goto out;
@@ -2060,7 +2090,7 @@ retry:
if ((uval & FUTEX_TID_MASK) != vpid)
return -EPERM;
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
if (unlikely(ret != 0))
goto out;
@@ -2249,7 +2279,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
debug_rt_mutex_init_waiter(&rt_waiter);
rt_waiter.task = NULL;
- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
if (unlikely(ret != 0))
goto out;
@@ -2697,7 +2727,7 @@ static int __init futex_init(void)
futex_cmpxchg_enabled = 1;
for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
- plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock);
+ plist_head_init(&futex_queues[i].chain);
spin_lock_init(&futex_queues[i].lock);
}
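The rule the new rw argument encodes at the call sites, restated as a sketch (this helper is hypothetical and not in the patch):

static int futex_key_rw(int op)
{
	switch (op) {
	case FUTEX_WAIT:		/* only loads the futex word */
	case FUTEX_WAKE:
		return VERIFY_READ;
	case FUTEX_LOCK_PI:		/* stores a TID into the word */
	case FUTEX_UNLOCK_PI:
	default:
		return VERIFY_WRITE;	/* conservative default */
	}
}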
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 5bf924d80b5c..a92028196cc1 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -3,7 +3,7 @@ menu "GCOV-based kernel profiling"
config GCOV_KERNEL
bool "Enable gcov-based kernel profiling"
depends on DEBUG_FS
- select CONSTRUCTORS
+ select CONSTRUCTORS if !UML
default n
---help---
This option enables gcov-based code profiling (e.g. for code coverage
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index d1d051b38e0b..5a38bf4de641 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -52,6 +52,10 @@ config IRQ_EDGE_EOI_HANDLER
config GENERIC_IRQ_CHIP
bool
+# Generic irq_domain hw <--> linux irq number translation
+config IRQ_DOMAIN
+ bool
+
# Support forced irq threading
config IRQ_FORCED_THREADING
bool
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 73290056cfb6..fff17381f0af 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -2,6 +2,7 @@
obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o
obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
+obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 1ef4ffcdfa55..bd8e788d71e0 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -87,8 +87,8 @@ void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
{
struct irq_devres match_data = { irq, dev_id };
- free_irq(irq, dev_id);
WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match,
&match_data));
+ free_irq(irq, dev_id);
}
EXPORT_SYMBOL(devm_free_irq);
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 31a9db711906..3a2cab407b93 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -101,10 +101,10 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
}
/**
- * irq_gc_ack - Ack pending interrupt
+ * irq_gc_ack_set_bit - Ack pending interrupt via setting bit
* @d: irq_data
*/
-void irq_gc_ack(struct irq_data *d)
+void irq_gc_ack_set_bit(struct irq_data *d)
{
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
u32 mask = 1 << (d->irq - gc->irq_base);
@@ -115,6 +115,20 @@ void irq_gc_ack(struct irq_data *d)
}
/**
+ * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit
+ * @d: irq_data
+ */
+void irq_gc_ack_clr_bit(struct irq_data *d)
+{
+ struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+ u32 mask = ~(1 << (d->irq - gc->irq_base));
+
+ irq_gc_lock(gc);
+ irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
+ irq_gc_unlock(gc);
+}
+
+/**
* irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt
* @d: irq_data
*/
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
new file mode 100644
index 000000000000..d5828da3fd38
--- /dev/null
+++ b/kernel/irq/irqdomain.c
@@ -0,0 +1,180 @@
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/slab.h>
+
+static LIST_HEAD(irq_domain_list);
+static DEFINE_MUTEX(irq_domain_mutex);
+
+/**
+ * irq_domain_add() - Register an irq_domain
+ * @domain: ptr to initialized irq_domain structure
+ *
+ * Registers an irq_domain structure. The irq_domain must at a minimum be
+ * initialized with an ops structure pointer, and either a ->to_irq hook or
+ * a valid irq_base value. Everything else is optional.
+ */
+void irq_domain_add(struct irq_domain *domain)
+{
+ struct irq_data *d;
+ int hwirq;
+
+ /*
+ * This assumes that the irq_domain owner has already allocated
+ * the irq_descs. This block will be removed when support for dynamic
+ * allocation of irq_descs is added to irq_domain.
+ */
+ for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) {
+ d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq));
+ if (!d || d->domain) {
+ /* things are broken; just report, don't clean up */
+ WARN(1, "error: irq_desc already assigned to a domain");
+ return;
+ }
+ d->domain = domain;
+ d->hwirq = hwirq;
+ }
+
+ mutex_lock(&irq_domain_mutex);
+ list_add(&domain->list, &irq_domain_list);
+ mutex_unlock(&irq_domain_mutex);
+}
+
+/**
+ * irq_domain_del() - Unregister an irq_domain
+ * @domain: ptr to registered irq_domain.
+ */
+void irq_domain_del(struct irq_domain *domain)
+{
+ struct irq_data *d;
+ int hwirq;
+
+ mutex_lock(&irq_domain_mutex);
+ list_del(&domain->list);
+ mutex_unlock(&irq_domain_mutex);
+
+ /* Clear the irq_domain assignments */
+ for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) {
+ d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq));
+ d->domain = NULL;
+ }
+}
+
+#if defined(CONFIG_OF_IRQ)
+/**
+ * irq_create_of_mapping() - Map a linux irq number from a DT interrupt spec
+ *
+ * Used by the device tree interrupt mapping code to translate a device tree
+ * interrupt specifier to a valid linux irq number. Returns either a valid
+ * linux IRQ number or 0.
+ *
+ * When the caller no longer needs the irq number returned by this function it
+ * should arrange to call irq_dispose_mapping().
+ */
+unsigned int irq_create_of_mapping(struct device_node *controller,
+ const u32 *intspec, unsigned int intsize)
+{
+ struct irq_domain *domain;
+ unsigned long hwirq;
+ unsigned int irq, type;
+ int rc = -EINVAL;
+
+ /* Find a domain which can translate the irq spec */
+ mutex_lock(&irq_domain_mutex);
+ list_for_each_entry(domain, &irq_domain_list, list) {
+ if (!domain->ops->dt_translate)
+ continue;
+ rc = domain->ops->dt_translate(domain, controller,
+ intspec, intsize, &hwirq, &type);
+ if (rc == 0)
+ break;
+ }
+ mutex_unlock(&irq_domain_mutex);
+
+ if (rc != 0)
+ return 0;
+
+ irq = irq_domain_to_irq(domain, hwirq);
+ if (type != IRQ_TYPE_NONE)
+ irq_set_irq_type(irq, type);
+ pr_debug("%s: mapped hwirq=%i to irq=%i, flags=%x\n",
+ controller->full_name, (int)hwirq, irq, type);
+ return irq;
+}
+EXPORT_SYMBOL_GPL(irq_create_of_mapping);
+
+/**
+ * irq_dispose_mapping() - Discard a mapping created by irq_create_of_mapping()
+ * @irq: linux irq number to be discarded
+ *
+ * Calling this function indicates the caller no longer needs a reference to
+ * the linux irq number returned by a prior call to irq_create_of_mapping().
+ */
+void irq_dispose_mapping(unsigned int irq)
+{
+ /*
+ * nothing yet; will be filled when support for dynamic allocation of
+ * irq_descs is added to irq_domain
+ */
+}
+EXPORT_SYMBOL_GPL(irq_dispose_mapping);
+
+int irq_domain_simple_dt_translate(struct irq_domain *d,
+ struct device_node *controller,
+ const u32 *intspec, unsigned int intsize,
+ unsigned long *out_hwirq, unsigned int *out_type)
+{
+ if (d->of_node != controller)
+ return -EINVAL;
+ if (intsize < 1)
+ return -EINVAL;
+
+ *out_hwirq = intspec[0];
+ *out_type = IRQ_TYPE_NONE;
+ if (intsize > 1)
+ *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
+ return 0;
+}
+
+struct irq_domain_ops irq_domain_simple_ops = {
+ .dt_translate = irq_domain_simple_dt_translate,
+};
+EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
+
+/**
+ * irq_domain_add_simple() - Set up a 'simple' translation range
+ */
+void irq_domain_add_simple(struct device_node *controller, int irq_base)
+{
+ struct irq_domain *domain;
+
+ domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+ if (!domain) {
+ WARN_ON(1);
+ return;
+ }
+
+ domain->irq_base = irq_base;
+ domain->of_node = of_node_get(controller);
+ domain->ops = &irq_domain_simple_ops;
+ irq_domain_add(domain);
+}
+EXPORT_SYMBOL_GPL(irq_domain_add_simple);
+
+void irq_domain_generate_simple(const struct of_device_id *match,
+ u64 phys_base, unsigned int irq_start)
+{
+ struct device_node *node;
+ pr_info("looking for phys_base=%llx, irq_start=%i\n",
+ (unsigned long long) phys_base, (int) irq_start);
+ node = of_find_matching_node_by_address(NULL, match, phys_base);
+ if (node)
+ irq_domain_add_simple(node, irq_start);
+ else
+ pr_info("no node found\n");
+}
+EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
+#endif /* CONFIG_OF_IRQ */
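A hypothetical board-setup use of the simple domain (the compatible string and irq_base are assumptions; the irq_descs must already be allocated, per the comment in irq_domain_add()):

static void __init board_init_irq(void)
{
	struct device_node *np;

	np = of_find_compatible_node(NULL, NULL, "vendor,example-intc");
	if (!np)
		return;

	/* hwirq n then resolves to linux irq irq_base + n */
	irq_domain_add_simple(np, 32 /* irq_base, assumed pre-allocated */);
	of_node_put(np);
}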
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 8d814cbc8109..296fbc84d659 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1095,7 +1095,7 @@ size_t crash_get_memory_size(void)
size_t size = 0;
mutex_lock(&kexec_mutex);
if (crashk_res.end != crashk_res.start)
- size = crashk_res.end - crashk_res.start + 1;
+ size = resource_size(&crashk_res);
mutex_unlock(&kexec_mutex);
return size;
}
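resource_size() is the canonical helper for this off-by-one-prone computation; its definition in <linux/ioport.h> is simply:

static inline resource_size_t resource_size(const struct resource *res)
{
	return res->end - res->start + 1;
}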
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 47613dfb7b28..ddc7644c1305 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -274,7 +274,7 @@ static void __call_usermodehelper(struct work_struct *work)
* (used for preventing user land processes from being created after the user
* land has been frozen during a system-wide hibernation or suspend operation).
*/
-static int usermodehelper_disabled;
+static int usermodehelper_disabled = 1;
/* Number of helpers running */
static atomic_t running_helpers = ATOMIC_INIT(0);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 298c9276dfdb..8c24294e477f 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2468,6 +2468,9 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
BUG_ON(usage_bit >= LOCK_USAGE_STATES);
+ if (hlock_class(hlock)->key == __lockdep_no_validate__.subkeys)
+ continue;
+
if (!mark_lock(curr, hlock, usage_bit))
return 0;
}
@@ -2478,34 +2481,13 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
/*
* Hardirqs will be enabled:
*/
-void trace_hardirqs_on_caller(unsigned long ip)
+static void __trace_hardirqs_on_caller(unsigned long ip)
{
struct task_struct *curr = current;
- time_hardirqs_on(CALLER_ADDR0, ip);
-
- if (unlikely(!debug_locks || current->lockdep_recursion))
- return;
-
- if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
- return;
-
- if (unlikely(curr->hardirqs_enabled)) {
- /*
- * Neither irq nor preemption are disabled here
- * so this is racy by nature but losing one hit
- * in a stat is not a big deal.
- */
- __debug_atomic_inc(redundant_hardirqs_on);
- return;
- }
/* we'll do an OFF -> ON transition: */
curr->hardirqs_enabled = 1;
- if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
- return;
- if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
- return;
/*
* We are going to turn hardirqs on, so set the
* usage bit for all held locks:
@@ -2525,6 +2507,37 @@ void trace_hardirqs_on_caller(unsigned long ip)
curr->hardirq_enable_event = ++curr->irq_events;
debug_atomic_inc(hardirqs_on_events);
}
+
+void trace_hardirqs_on_caller(unsigned long ip)
+{
+ time_hardirqs_on(CALLER_ADDR0, ip);
+
+ if (unlikely(!debug_locks || current->lockdep_recursion))
+ return;
+
+ if (unlikely(current->hardirqs_enabled)) {
+ /*
+ * Neither irq nor preemption are disabled here
+ * so this is racy by nature but losing one hit
+ * in a stat is not a big deal.
+ */
+ __debug_atomic_inc(redundant_hardirqs_on);
+ return;
+ }
+
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return;
+
+ if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
+ return;
+
+ if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
+ return;
+
+ current->lockdep_recursion = 1;
+ __trace_hardirqs_on_caller(ip);
+ current->lockdep_recursion = 0;
+}
EXPORT_SYMBOL(trace_hardirqs_on_caller);
void trace_hardirqs_on(void)
@@ -2574,7 +2587,7 @@ void trace_softirqs_on(unsigned long ip)
{
struct task_struct *curr = current;
- if (unlikely(!debug_locks))
+ if (unlikely(!debug_locks || current->lockdep_recursion))
return;
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
@@ -2585,6 +2598,7 @@ void trace_softirqs_on(unsigned long ip)
return;
}
+ current->lockdep_recursion = 1;
/*
* We'll do an OFF -> ON transition:
*/
@@ -2599,6 +2613,7 @@ void trace_softirqs_on(unsigned long ip)
*/
if (curr->hardirqs_enabled)
mark_held_locks(curr, SOFTIRQ);
+ current->lockdep_recursion = 0;
}
/*
@@ -2608,7 +2623,7 @@ void trace_softirqs_off(unsigned long ip)
{
struct task_struct *curr = current;
- if (unlikely(!debug_locks))
+ if (unlikely(!debug_locks || current->lockdep_recursion))
return;
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
@@ -2859,10 +2874,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
void lockdep_init_map(struct lockdep_map *lock, const char *name,
struct lock_class_key *key, int subclass)
{
- int i;
-
- for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
- lock->class_cache[i] = NULL;
+ memset(lock, 0, sizeof(*lock));
#ifdef CONFIG_LOCK_STAT
lock->cpu = raw_smp_processor_id();
diff --git a/kernel/module.c b/kernel/module.c
index 795bdc7f5c3f..04379f92f843 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -545,9 +545,9 @@ static void setup_modinfo_##field(struct module *mod, const char *s) \
mod->field = kstrdup(s, GFP_KERNEL); \
} \
static ssize_t show_modinfo_##field(struct module_attribute *mattr, \
- struct module *mod, char *buffer) \
+ struct module_kobject *mk, char *buffer) \
{ \
- return sprintf(buffer, "%s\n", mod->field); \
+ return sprintf(buffer, "%s\n", mk->mod->field); \
} \
static int modinfo_##field##_exists(struct module *mod) \
{ \
@@ -902,9 +902,9 @@ void symbol_put_addr(void *addr)
EXPORT_SYMBOL_GPL(symbol_put_addr);
static ssize_t show_refcnt(struct module_attribute *mattr,
- struct module *mod, char *buffer)
+ struct module_kobject *mk, char *buffer)
{
- return sprintf(buffer, "%u\n", module_refcount(mod));
+ return sprintf(buffer, "%u\n", module_refcount(mk->mod));
}
static struct module_attribute refcnt = {
@@ -952,11 +952,11 @@ static inline int module_unload_init(struct module *mod)
#endif /* CONFIG_MODULE_UNLOAD */
static ssize_t show_initstate(struct module_attribute *mattr,
- struct module *mod, char *buffer)
+ struct module_kobject *mk, char *buffer)
{
const char *state = "unknown";
- switch (mod->state) {
+ switch (mk->mod->state) {
case MODULE_STATE_LIVE:
state = "live";
break;
@@ -975,10 +975,27 @@ static struct module_attribute initstate = {
.show = show_initstate,
};
+static ssize_t store_uevent(struct module_attribute *mattr,
+ struct module_kobject *mk,
+ const char *buffer, size_t count)
+{
+ enum kobject_action action;
+
+ if (kobject_action_type(buffer, count, &action) == 0)
+ kobject_uevent(&mk->kobj, action);
+ return count;
+}
+
+struct module_attribute module_uevent = {
+ .attr = { .name = "uevent", .mode = 0200 },
+ .store = store_uevent,
+};
+
static struct module_attribute *modinfo_attrs[] = {
&modinfo_version,
&modinfo_srcversion,
&initstate,
+ &module_uevent,
#ifdef CONFIG_MODULE_UNLOAD
&refcnt,
#endif
@@ -1187,7 +1204,7 @@ struct module_sect_attrs
};
static ssize_t module_sect_show(struct module_attribute *mattr,
- struct module *mod, char *buf)
+ struct module_kobject *mk, char *buf)
{
struct module_sect_attr *sattr =
container_of(mattr, struct module_sect_attr, mattr);
@@ -1697,6 +1714,15 @@ static void unset_module_core_ro_nx(struct module *mod) { }
static void unset_module_init_ro_nx(struct module *mod) { }
#endif
+void __weak module_free(struct module *mod, void *module_region)
+{
+ vfree(module_region);
+}
+
+void __weak module_arch_cleanup(struct module *mod)
+{
+}
+
/* Free a module, remove from lists, etc. */
static void free_module(struct module *mod)
{
@@ -1851,6 +1877,26 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
return ret;
}
+int __weak apply_relocate(Elf_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me)
+{
+ pr_err("module %s: REL relocation unsupported\n", me->name);
+ return -ENOEXEC;
+}
+
+int __weak apply_relocate_add(Elf_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me)
+{
+ pr_err("module %s: RELA relocation unsupported\n", me->name);
+ return -ENOEXEC;
+}
+
static int apply_relocations(struct module *mod, const struct load_info *info)
{
unsigned int i;
@@ -2235,6 +2281,11 @@ static void dynamic_debug_remove(struct _ddebug *debug)
ddebug_remove_module(debug->modname);
}
+void * __weak module_alloc(unsigned long size)
+{
+ return size == 0 ? NULL : vmalloc_exec(size);
+}
+
static void *module_alloc_update_bounds(unsigned long size)
{
void *ret = module_alloc(size);
@@ -2645,6 +2696,14 @@ static void flush_module_icache(const struct module *mod)
set_fs(old_fs);
}
+int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
+ Elf_Shdr *sechdrs,
+ char *secstrings,
+ struct module *mod)
+{
+ return 0;
+}
+
static struct module *layout_and_allocate(struct load_info *info)
{
/* Module within temporary copy. */
@@ -2716,6 +2775,13 @@ static void module_deallocate(struct module *mod, struct load_info *info)
module_free(mod, mod->module_core);
}
+int __weak module_finalize(const Elf_Ehdr *hdr,
+ const Elf_Shdr *sechdrs,
+ struct module *me)
+{
+ return 0;
+}
+
static int post_relocation(struct module *mod, const struct load_info *info)
{
/* Sort exception table now relocations are done. */
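The new __weak definitions above (module_free, module_arch_cleanup, apply_relocate, apply_relocate_add, module_alloc, module_frob_arch_sections, module_finalize) give every architecture a sane default that a strong, arch-specific symbol of the same name silently overrides at link time. A standalone sketch of the weak-symbol mechanism as GCC/Clang implement it (arch_setup is a made-up name):

#include <stdio.h>

/* Generic fallback; an arch file would supply a strong definition
 * with the same signature, and the linker would prefer it. */
__attribute__((weak)) int arch_setup(void)
{
    puts("generic arch_setup fallback");
    return 0;
}

int main(void)
{
    return arch_setup();
}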
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 2488ba7eb568..8d7b435806c9 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -525,37 +525,6 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh)
}
EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
-/**
- * register_reboot_notifier - Register function to be called at reboot time
- * @nb: Info about notifier function to be called
- *
- * Registers a function with the list of functions
- * to be called at reboot time.
- *
- * Currently always returns zero, as blocking_notifier_chain_register()
- * always returns zero.
- */
-int register_reboot_notifier(struct notifier_block *nb)
-{
- return blocking_notifier_chain_register(&reboot_notifier_list, nb);
-}
-EXPORT_SYMBOL(register_reboot_notifier);
-
-/**
- * unregister_reboot_notifier - Unregister previously registered reboot notifier
- * @nb: Hook to be unregistered
- *
- * Unregisters a previously registered reboot
- * notifier function.
- *
- * Returns zero on success, or %-ENOENT on failure.
- */
-int unregister_reboot_notifier(struct notifier_block *nb)
-{
- return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
-}
-EXPORT_SYMBOL(unregister_reboot_notifier);
-
static ATOMIC_NOTIFIER_HEAD(die_chain);
int notrace __kprobes notify_die(enum die_val val, const char *str,
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index d6a00f3de15d..9aeab4b98c64 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -271,10 +271,8 @@ out:
return err;
}
-static int __init nsproxy_cache_init(void)
+int __init nsproxy_cache_init(void)
{
nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
return 0;
}
-
-module_init(nsproxy_cache_init);
diff --git a/kernel/panic.c b/kernel/panic.c
index 69231670eb95..d7bb6974efb5 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -119,6 +119,8 @@ NORET_TYPE void panic(const char * fmt, ...)
}
mdelay(PANIC_TIMER_STEP);
}
+ }
+ if (panic_timeout != 0) {
/*
* This will not be a clean reboot, with everything
* shutting down. But if there is a chance of
diff --git a/kernel/params.c b/kernel/params.c
index ed72e1330862..22df3e0d142a 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -225,8 +225,8 @@ int parse_args(const char *name,
int ret; \
\
ret = strtolfn(val, 0, &l); \
- if (ret == -EINVAL || ((type)l != l)) \
- return -EINVAL; \
+ if (ret < 0 || ((type)l != l)) \
+ return ret < 0 ? ret : -EINVAL; \
*((type *)kp->arg) = l; \
return 0; \
} \
@@ -511,7 +511,7 @@ struct module_param_attrs
#define to_param_attr(n) container_of(n, struct param_attribute, mattr)
static ssize_t param_attr_show(struct module_attribute *mattr,
- struct module *mod, char *buf)
+ struct module_kobject *mk, char *buf)
{
int count;
struct param_attribute *attribute = to_param_attr(mattr);
@@ -531,7 +531,7 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
/* sysfs always hands a nul-terminated string in buf. We rely on that. */
static ssize_t param_attr_store(struct module_attribute *mattr,
- struct module *owner,
+ struct module_kobject *km,
const char *buf, size_t len)
{
int err;
@@ -730,6 +730,10 @@ static struct module_kobject * __init locate_module_kobject(const char *name)
mk->kobj.kset = module_kset;
err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL,
"%s", name);
+#ifdef CONFIG_MODULES
+ if (!err)
+ err = sysfs_create_file(&mk->kobj, &module_uevent.attr);
+#endif
if (err) {
kobject_put(&mk->kobj);
printk(KERN_ERR
@@ -807,7 +811,7 @@ static void __init param_sysfs_builtin(void)
}
ssize_t __modver_version_show(struct module_attribute *mattr,
- struct module *mod, char *buf)
+ struct module_kobject *mk, char *buf)
{
struct module_version_attribute *vattr =
container_of(mattr, struct module_version_attribute, mattr);
@@ -852,7 +856,7 @@ static ssize_t module_attr_show(struct kobject *kobj,
if (!attribute->show)
return -EIO;
- ret = attribute->show(attribute, mk->mod, buf);
+ ret = attribute->show(attribute, mk, buf);
return ret;
}
@@ -871,7 +875,7 @@ static ssize_t module_attr_store(struct kobject *kobj,
if (!attribute->store)
return -EIO;
- ret = attribute->store(attribute, mk->mod, buf, len);
+ ret = attribute->store(attribute, mk, buf, len);
return ret;
}
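The STANDARD_PARAM_DEF hunk above stops flattening every parse failure into -EINVAL: the strtol-family error (such as -ERANGE) is now passed through, and -EINVAL is reserved for values that parse but overflow the target type. A userspace model of the distinction, with the kernel's kstrtol approximated by strtol/errno:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Model of the fixed macro body: propagate the parser's own error,
 * reserve EINVAL for "parsed fine but does not fit the type". */
static int set_short_param(const char *val, short *out)
{
    char *end;
    long l;

    errno = 0;
    l = strtol(val, &end, 0);
    if (errno)                      /* e.g. ERANGE */
        return -errno;
    if (*end || end == val)         /* trailing junk / empty string */
        return -EINVAL;
    if ((short)l != l)              /* does it fit the target type? */
        return -EINVAL;
    *out = (short)l;
    return 0;
}

int main(void)
{
    short v;

    printf("%d\n", set_short_param("123", &v));                    /* 0 */
    printf("%d\n", set_short_param("99999999999999999999", &v));   /* -ERANGE */
    printf("%d\n", set_short_param("70000", &v));                  /* -EINVAL */
    return 0;
}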
diff --git a/kernel/pid.c b/kernel/pid.c
index 57a8346a270e..e432057f3b21 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -405,7 +405,6 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
if (pid) {
struct hlist_node *first;
first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
- rcu_read_lock_held() ||
lockdep_tasklist_lock_is_held());
if (first)
result = hlist_entry(first, struct task_struct, pids[(type)].node);
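The pid.c hunk drops the explicit rcu_read_lock_held() term because rcu_dereference_check() began folding the RCU read-side check in by itself; callers now list only the additional conditions under which the dereference is legal. A toy model of a checked dereference with one implicit and one caller-supplied condition (all names invented):

#include <assert.h>
#include <stdio.h>

static int rcu_lock_depth;          /* models rcu_read_lock() nesting */
static int tasklist_lock_held;      /* models the extra lockdep condition */

/* The checked dereference always accepts an RCU read-side critical
 * section, plus whatever extra condition the caller supplies. */
#define deref_check(p, extra) \
    (assert(rcu_lock_depth > 0 || (extra)), (p))

int main(void)
{
    int value = 42;
    int *ptr = &value;

    tasklist_lock_held = 1;         /* legal via the extra condition */
    printf("%d\n", *deref_check(ptr, tasklist_lock_held));

    tasklist_lock_held = 0;
    rcu_lock_depth = 1;             /* legal via the implicit RCU check */
    printf("%d\n", *deref_check(ptr, tasklist_lock_held));
    return 0;
}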
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 6824ca7d4d0c..37f05d0f0793 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -74,7 +74,7 @@ static DEFINE_SPINLOCK(pm_qos_lock);
static struct pm_qos_object null_pm_qos;
static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
static struct pm_qos_object cpu_dma_pm_qos = {
- .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock),
+ .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests),
.notifiers = &cpu_dma_lat_notifier,
.name = "cpu_dma_latency",
.target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
@@ -84,7 +84,7 @@ static struct pm_qos_object cpu_dma_pm_qos = {
static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
static struct pm_qos_object network_lat_pm_qos = {
- .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock),
+ .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests),
.notifiers = &network_lat_notifier,
.name = "network_latency",
.target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
@@ -95,7 +95,7 @@ static struct pm_qos_object network_lat_pm_qos = {
static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
static struct pm_qos_object network_throughput_pm_qos = {
- .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock),
+ .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests),
.notifiers = &network_throughput_notifier,
.name = "network_throughput",
.target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 87f4d24b55b0..b1914cb9095c 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -193,8 +193,8 @@ config APM_EMULATION
notification of APM "events" (e.g. battery status change).
In order to use APM, you will need supporting software. For location
- and more information, read <file:Documentation/power/pm.txt> and the
- Battery Powered Linux mini-HOWTO, available from
+ and more information, read <file:Documentation/power/apm-acpi.txt>
+ and the Battery Powered Linux mini-HOWTO, available from
<http://www.tldp.org/docs.html#howto>.
This driver does not spin down disk drives (see the hdparm(8)
@@ -224,6 +224,10 @@ config PM_OPP
implementations a ready to use framework to manage OPPs.
For more information, read <file:Documentation/power/opp.txt>
-config PM_RUNTIME_CLK
+config PM_CLK
def_bool y
- depends on PM_RUNTIME && HAVE_CLK
+ depends on PM && HAVE_CLK
+
+config PM_GENERIC_DOMAINS
+ bool
+ depends on PM
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 2981af4ce7cb..6c601f871964 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -37,8 +37,9 @@ EXPORT_SYMBOL_GPL(unregister_pm_notifier);
int pm_notifier_call_chain(unsigned long val)
{
- return (blocking_notifier_call_chain(&pm_chain_head, val, NULL)
- == NOTIFY_BAD) ? -EINVAL : 0;
+ int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL);
+
+ return notifier_to_errno(ret);
}
/* If set, devices may be suspended and resumed asynchronously. */
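pm_notifier_call_chain() used to map anything other than success to -EINVAL; with notifier_to_errno() it recovers the exact errno a callback packed into the chain's return value via notifier_from_errno(). A userspace model of the encode/decode pair (the constants mirror include/linux/notifier.h):

#include <errno.h>
#include <stdio.h>

#define NOTIFY_OK        0x0001
#define NOTIFY_STOP_MASK 0x8000

/* Pack a -Exxx value into a chain return code, and unpack it again
 * on the caller's side. */
static int notifier_from_errno(int err)
{
    return err ? (NOTIFY_STOP_MASK | (NOTIFY_OK - err)) : NOTIFY_OK;
}

static int notifier_to_errno(int ret)
{
    ret &= ~NOTIFY_STOP_MASK;
    return ret > NOTIFY_OK ? NOTIFY_OK - ret : 0;
}

int main(void)
{
    int ret = notifier_from_errno(-EBUSY);       /* callback failed */

    printf("%d\n", notifier_to_errno(ret));      /* -EBUSY, not -EINVAL */
    printf("%d\n", notifier_to_errno(NOTIFY_OK)); /* 0 on success */
    return 0;
}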
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index ace55889f702..06efa54f93d6 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1211,7 +1211,11 @@ static void free_unnecessary_pages(void)
to_free_highmem = alloc_highmem - save;
} else {
to_free_highmem = 0;
- to_free_normal -= save - alloc_highmem;
+ save -= alloc_highmem;
+ if (to_free_normal > save)
+ to_free_normal -= save;
+ else
+ to_free_normal = 0;
}
memory_bm_position_reset(&copy_bm);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 1c41ba215419..b6b71ad2208f 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -44,6 +44,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops)
suspend_ops = ops;
mutex_unlock(&pm_mutex);
}
+EXPORT_SYMBOL_GPL(suspend_set_ops);
bool valid_state(suspend_state_t state)
{
@@ -65,6 +66,7 @@ int suspend_valid_only_mem(suspend_state_t state)
{
return state == PM_SUSPEND_MEM;
}
+EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
static int suspend_test(int level)
{
@@ -126,12 +128,13 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
}
/**
- * suspend_enter - enter the desired system sleep state.
- * @state: state to enter
+ * suspend_enter - enter the desired system sleep state.
+ * @state: State to enter
+ * @wakeup: Set to %true if a wakeup event occurred, in which case suspend should not be re-entered.
*
- * This function should be called after devices have been suspended.
+ * This function should be called after devices have been suspended.
*/
-static int suspend_enter(suspend_state_t state)
+static int suspend_enter(suspend_state_t state, bool *wakeup)
{
int error;
@@ -165,7 +168,8 @@ static int suspend_enter(suspend_state_t state)
error = syscore_suspend();
if (!error) {
- if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
+ *wakeup = pm_wakeup_pending();
+ if (!(suspend_test(TEST_CORE) || *wakeup)) {
error = suspend_ops->enter(state);
events_check_enabled = false;
}
@@ -199,6 +203,7 @@ static int suspend_enter(suspend_state_t state)
int suspend_devices_and_enter(suspend_state_t state)
{
int error;
+ bool wakeup = false;
if (!suspend_ops)
return -ENOSYS;
@@ -220,7 +225,10 @@ int suspend_devices_and_enter(suspend_state_t state)
if (suspend_test(TEST_DEVICES))
goto Recover_platform;
- error = suspend_enter(state);
+ do {
+ error = suspend_enter(state, &wakeup);
+ } while (!error && !wakeup
+ && suspend_ops->suspend_again && suspend_ops->suspend_again());
Resume_devices:
suspend_test_start();
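The suspend loop above lets a platform's optional suspend_again() callback send the system straight back to sleep after a wakeup-free pass, which suits devices that wake periodically for housekeeping. A userspace model of that control flow with stubbed callbacks (all functions are stand-ins):

#include <stdbool.h>
#include <stdio.h>

static int sleeps_left = 3;

/* Stub: returns 0 on success and reports whether a real wakeup fired. */
static int suspend_enter(bool *wakeup)
{
    sleeps_left--;
    *wakeup = (sleeps_left == 0);   /* pretend a wakeup arrives eventually */
    printf("slept, wakeup=%d\n", *wakeup);
    return 0;
}

/* Stub for suspend_ops->suspend_again(): keep sleeping while work remains. */
static bool suspend_again(void)
{
    return sleeps_left > 0;
}

int main(void)
{
    bool wakeup = false;
    int error;

    do {
        error = suspend_enter(&wakeup);
    } while (!error && !wakeup && suspend_again());

    return error;
}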
diff --git a/kernel/printk.c b/kernel/printk.c
index 35185392173f..37dff3429adb 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -782,7 +782,7 @@ static inline int can_use_console(unsigned int cpu)
static int console_trylock_for_printk(unsigned int cpu)
__releases(&logbuf_lock)
{
- int retval = 0;
+ int retval = 0, wake = 0;
if (console_trylock()) {
retval = 1;
@@ -795,12 +795,14 @@ static int console_trylock_for_printk(unsigned int cpu)
*/
if (!can_use_console(cpu)) {
console_locked = 0;
- up(&console_sem);
+ wake = 1;
retval = 0;
}
}
printk_cpu = UINT_MAX;
spin_unlock(&logbuf_lock);
+ if (wake)
+ up(&console_sem);
return retval;
}
static const char recursion_bug_msg [] =
@@ -1242,7 +1244,7 @@ void console_unlock(void)
{
unsigned long flags;
unsigned _con_start, _log_end;
- unsigned wake_klogd = 0;
+ unsigned wake_klogd = 0, retry = 0;
if (console_suspended) {
up(&console_sem);
@@ -1251,6 +1253,7 @@ void console_unlock(void)
console_may_schedule = 0;
+again:
for ( ; ; ) {
spin_lock_irqsave(&logbuf_lock, flags);
wake_klogd |= log_start - log_end;
@@ -1271,8 +1274,23 @@ void console_unlock(void)
if (unlikely(exclusive_console))
exclusive_console = NULL;
+ spin_unlock(&logbuf_lock);
+
up(&console_sem);
+
+ /*
+ * Someone could have filled up the buffer again, so re-check if there's
+ * something to flush. In case we cannot trylock the console_sem again,
+ * there's a new owner and the console_unlock() from them will do the
+ * flush, no worries.
+ */
+ spin_lock(&logbuf_lock);
+ if (con_start != log_end)
+ retry = 1;
spin_unlock_irqrestore(&logbuf_lock, flags);
+ if (retry && console_trylock())
+ goto again;
+
if (wake_klogd)
wake_up_klogd();
}
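The console_unlock() change closes a window where messages appended after the final flush but before console_sem was released would sit unflushed until the next printk. The fix rechecks the buffer after dropping the semaphore and loops only if it wins the trylock; otherwise the new owner flushes. A single-threaded userspace model of the recheck-after-unlock pattern (the racing producer is simulated inline):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t console_sem = PTHREAD_MUTEX_INITIALIZER;
static int con_start, log_end;      /* models the ring-buffer cursors */
static int simulated_race;

static void flush_log(void)
{
    while (con_start != log_end) {
        printf("flush message %d\n", con_start);
        con_start++;
    }
}

static void console_unlock(void)
{
again:
    flush_log();
    pthread_mutex_unlock(&console_sem);

    if (!simulated_race) {          /* a printk() racing with us */
        simulated_race = 1;
        log_end++;
    }

    /* Re-check after dropping the sem; if the trylock fails, the new
     * owner will do the flushing, so we can simply leave. */
    if (con_start != log_end && pthread_mutex_trylock(&console_sem) == 0)
        goto again;
}

int main(void)
{
    pthread_mutex_lock(&console_sem);
    log_end = 2;
    console_unlock();
    return 0;
}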
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 2df115790cd9..9de3ecfd20f9 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -23,8 +23,15 @@
#include <linux/uaccess.h>
#include <linux/regset.h>
#include <linux/hw_breakpoint.h>
+#include <linux/cn_proc.h>
+static int ptrace_trapping_sleep_fn(void *flags)
+{
+ schedule();
+ return 0;
+}
+
/*
* ptrace a task: make the debugger its new parent and
* move it to the ptrace list.
@@ -77,13 +84,20 @@ void __ptrace_unlink(struct task_struct *child)
spin_lock(&child->sighand->siglock);
/*
- * Reinstate GROUP_STOP_PENDING if group stop is in effect and
+ * Clear all pending traps and TRAPPING. TRAPPING should be
+ * cleared regardless of JOBCTL_STOP_PENDING. Do it explicitly.
+ */
+ task_clear_jobctl_pending(child, JOBCTL_TRAP_MASK);
+ task_clear_jobctl_trapping(child);
+
+ /*
+ * Reinstate JOBCTL_STOP_PENDING if group stop is in effect and
* @child isn't dead.
*/
if (!(child->flags & PF_EXITING) &&
(child->signal->flags & SIGNAL_STOP_STOPPED ||
child->signal->group_stop_count))
- child->group_stop |= GROUP_STOP_PENDING;
+ child->jobctl |= JOBCTL_STOP_PENDING;
/*
* If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
@@ -91,16 +105,30 @@ void __ptrace_unlink(struct task_struct *child)
* is in TASK_TRACED; otherwise, we might unduly disrupt
* TASK_KILLABLE sleeps.
*/
- if (child->group_stop & GROUP_STOP_PENDING || task_is_traced(child))
+ if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child))
signal_wake_up(child, task_is_traced(child));
spin_unlock(&child->sighand->siglock);
}
-/*
- * Check that we have indeed attached to the thing..
+/**
+ * ptrace_check_attach - check whether ptracee is ready for ptrace operation
+ * @child: ptracee to check for
+ * @ignore_state: don't check whether @child is currently %TASK_TRACED
+ *
+ * Check whether @child is being ptraced by %current and ready for further
+ * ptrace operations. If @ignore_state is %false, @child also should be in
+ * %TASK_TRACED state and on return the child is guaranteed to be traced
+ * and not executing. If @ignore_state is %true, @child can be in any
+ * state.
+ *
+ * CONTEXT:
+ * Grabs and releases tasklist_lock and @child->sighand->siglock.
+ *
+ * RETURNS:
+ * 0 on success, -ESRCH if @child is not ready.
*/
-int ptrace_check_attach(struct task_struct *child, int kill)
+int ptrace_check_attach(struct task_struct *child, bool ignore_state)
{
int ret = -ESRCH;
@@ -119,13 +147,14 @@ int ptrace_check_attach(struct task_struct *child, int kill)
*/
spin_lock_irq(&child->sighand->siglock);
WARN_ON_ONCE(task_is_stopped(child));
- if (task_is_traced(child) || kill)
+ if (ignore_state || (task_is_traced(child) &&
+ !(child->jobctl & JOBCTL_LISTENING)))
ret = 0;
spin_unlock_irq(&child->sighand->siglock);
}
read_unlock(&tasklist_lock);
- if (!ret && !kill)
+ if (!ret && !ignore_state)
ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH;
/* All systems go.. */
@@ -182,11 +211,28 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
return !err;
}
-static int ptrace_attach(struct task_struct *task)
+static int ptrace_attach(struct task_struct *task, long request,
+ unsigned long flags)
{
- bool wait_trap = false;
+ bool seize = (request == PTRACE_SEIZE);
int retval;
+ /*
+ * SEIZE will enable new ptrace behaviors which will be implemented
+ * gradually. SEIZE_DEVEL is used to prevent applications
+ * expecting full SEIZE behaviors trapping on kernel commits which
+ * are still in the process of implementing them.
+ *
+ * Only test programs for new ptrace behaviors being implemented
+ * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO.
+ *
+ * Once SEIZE behaviors are completely implemented, this flag and
+ * the following test will be removed.
+ */
+ retval = -EIO;
+ if (seize && !(flags & PTRACE_SEIZE_DEVEL))
+ goto out;
+
audit_ptrace(task);
retval = -EPERM;
@@ -218,16 +264,21 @@ static int ptrace_attach(struct task_struct *task)
goto unlock_tasklist;
task->ptrace = PT_PTRACED;
+ if (seize)
+ task->ptrace |= PT_SEIZED;
if (task_ns_capable(task, CAP_SYS_PTRACE))
task->ptrace |= PT_PTRACE_CAP;
__ptrace_link(task, current);
- send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
+
+ /* SEIZE doesn't trap tracee on attach */
+ if (!seize)
+ send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
spin_lock(&task->sighand->siglock);
/*
- * If the task is already STOPPED, set GROUP_STOP_PENDING and
+ * If the task is already STOPPED, set JOBCTL_TRAP_STOP and
* TRAPPING, and kick it so that it transits to TRACED. TRAPPING
* will be cleared if the child completes the transition or any
* event which clears the group stop states happens. We'll wait
@@ -243,11 +294,9 @@ static int ptrace_attach(struct task_struct *task)
* The following task_is_stopped() test is safe as both transitions
* in and out of STOPPED are protected by siglock.
*/
- if (task_is_stopped(task)) {
- task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING;
+ if (task_is_stopped(task) &&
+ task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
signal_wake_up(task, 1);
- wait_trap = true;
- }
spin_unlock(&task->sighand->siglock);
@@ -257,9 +306,12 @@ unlock_tasklist:
unlock_creds:
mutex_unlock(&task->signal->cred_guard_mutex);
out:
- if (wait_trap)
- wait_event(current->signal->wait_chldexit,
- !(task->group_stop & GROUP_STOP_TRAPPING));
+ if (!retval) {
+ wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT,
+ ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE);
+ proc_ptrace_connector(task, PTRACE_ATTACH);
+ }
+
return retval;
}
@@ -322,25 +374,27 @@ static int ignoring_children(struct sighand_struct *sigh)
*/
static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
{
+ bool dead;
+
__ptrace_unlink(p);
- if (p->exit_state == EXIT_ZOMBIE) {
- if (!task_detached(p) && thread_group_empty(p)) {
- if (!same_thread_group(p->real_parent, tracer))
- do_notify_parent(p, p->exit_signal);
- else if (ignoring_children(tracer->sighand)) {
- __wake_up_parent(p, tracer);
- p->exit_signal = -1;
- }
- }
- if (task_detached(p)) {
- /* Mark it as in the process of being reaped. */
- p->exit_state = EXIT_DEAD;
- return true;
+ if (p->exit_state != EXIT_ZOMBIE)
+ return false;
+
+ dead = !thread_group_leader(p);
+
+ if (!dead && thread_group_empty(p)) {
+ if (!same_thread_group(p->real_parent, tracer))
+ dead = do_notify_parent(p, p->exit_signal);
+ else if (ignoring_children(tracer->sighand)) {
+ __wake_up_parent(p, tracer);
+ dead = true;
}
}
-
- return false;
+ /* Mark it as in the process of being reaped. */
+ if (dead)
+ p->exit_state = EXIT_DEAD;
+ return dead;
}
static int ptrace_detach(struct task_struct *child, unsigned int data)
@@ -365,6 +419,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
}
write_unlock_irq(&tasklist_lock);
+ proc_ptrace_connector(child, PTRACE_DETACH);
if (unlikely(dead))
release_task(child);
@@ -611,10 +666,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
int ptrace_request(struct task_struct *child, long request,
unsigned long addr, unsigned long data)
{
+ bool seized = child->ptrace & PT_SEIZED;
int ret = -EIO;
- siginfo_t siginfo;
+ siginfo_t siginfo, *si;
void __user *datavp = (void __user *) data;
unsigned long __user *datalp = datavp;
+ unsigned long flags;
switch (request) {
case PTRACE_PEEKTEXT:
@@ -647,6 +704,62 @@ int ptrace_request(struct task_struct *child, long request,
ret = ptrace_setsiginfo(child, &siginfo);
break;
+ case PTRACE_INTERRUPT:
+ /*
+ * Stop tracee without any side-effect on signal or job
+ * control. At least one trap is guaranteed to happen
+ * after this request. If @child is already trapped, the
+ * current trap is not disturbed and another trap will
+ * happen after the current trap is ended with PTRACE_CONT.
+ *
+ * The actual trap might not be a PTRACE_EVENT_STOP trap, but
+ * the pending condition is cleared regardless.
+ */
+ if (unlikely(!seized || !lock_task_sighand(child, &flags)))
+ break;
+
+ /*
+ * INTERRUPT doesn't disturb an existing trap, with one
+ * exception: if the ptracer issued LISTEN for the current
+ * STOP, this INTERRUPT should clear LISTEN and re-trap the
+ * tracee into STOP.
+ */
+ if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP)))
+ signal_wake_up(child, child->jobctl & JOBCTL_LISTENING);
+
+ unlock_task_sighand(child, &flags);
+ ret = 0;
+ break;
+
+ case PTRACE_LISTEN:
+ /*
+ * Listen for events. Tracee must be in STOP. It's not
+ * resumed per se but is not considered to be in TRACED by
+ * wait(2) or ptrace(2). If an async event (e.g. group
+ * stop state change) happens, tracee will enter STOP trap
+ * again. Alternatively, ptracer can issue INTERRUPT to
+ * finish listening and re-trap tracee into STOP.
+ */
+ if (unlikely(!seized || !lock_task_sighand(child, &flags)))
+ break;
+
+ si = child->last_siginfo;
+ if (unlikely(!si || si->si_code >> 8 != PTRACE_EVENT_STOP)) {
+ unlock_task_sighand(child, &flags);
+ break;
+ }
+
+ child->jobctl |= JOBCTL_LISTENING;
+
+ /*
+ * If NOTIFY is set, it means event happened between start
+ * of this trap and now. Trigger re-trap immediately.
+ */
+ if (child->jobctl & JOBCTL_TRAP_NOTIFY)
+ signal_wake_up(child, true);
+
+ unlock_task_sighand(child, &flags);
+ ret = 0;
+ break;
+
case PTRACE_DETACH: /* detach a process that was attached. */
ret = ptrace_detach(child, data);
break;
@@ -761,8 +874,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
goto out;
}
- if (request == PTRACE_ATTACH) {
- ret = ptrace_attach(child);
+ if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
+ ret = ptrace_attach(child, request, data);
/*
* Some architectures need to do book-keeping after
* a ptrace attach.
@@ -772,7 +885,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
goto out_put_task_struct;
}
- ret = ptrace_check_attach(child, request == PTRACE_KILL);
+ ret = ptrace_check_attach(child, request == PTRACE_KILL ||
+ request == PTRACE_INTERRUPT);
if (ret < 0)
goto out_put_task_struct;
@@ -903,8 +1017,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
goto out;
}
- if (request == PTRACE_ATTACH) {
- ret = ptrace_attach(child);
+ if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
+ ret = ptrace_attach(child, request, data);
/*
* Some architectures need to do book-keeping after
* a ptrace attach.
@@ -914,7 +1028,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
goto out_put_task_struct;
}
- ret = ptrace_check_attach(child, request == PTRACE_KILL);
+ ret = ptrace_check_attach(child, request == PTRACE_KILL ||
+ request == PTRACE_INTERRUPT);
if (!ret)
ret = compat_arch_ptrace(child, request, addr, data);
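Taken together, the ptrace.c hunks introduce PTRACE_SEIZE (attach without trapping), PTRACE_INTERRUPT (trap on demand) and PTRACE_LISTEN. A hedged userspace sketch against this kernel's provisional ABI; the raw request values come from this tree's <linux/ptrace.h>, and the PTRACE_SEIZE_DEVEL opt-in bit was a temporary gate that later kernels removed:

#include <signal.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <unistd.h>

/* Guarded because modern libc headers define the first two themselves. */
#ifndef PTRACE_SEIZE
#define PTRACE_SEIZE     0x4206
#endif
#ifndef PTRACE_INTERRUPT
#define PTRACE_INTERRUPT 0x4207
#endif
#define PTRACE_SEIZE_DEVEL 0x80000000UL

int main(void)
{
    pid_t pid = fork();

    if (pid == 0)
        for (;;)
            pause();                /* tracee just idles */

    /* SEIZE: attach without trapping the child; no SIGSTOP is sent. */
    if (ptrace(PTRACE_SEIZE, pid, NULL, (void *)PTRACE_SEIZE_DEVEL) == -1) {
        perror("PTRACE_SEIZE");
        kill(pid, SIGKILL);
        return 1;
    }

    /* INTERRUPT: trap it on demand, without signal side effects. */
    ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);
    waitpid(pid, NULL, 0);          /* observe the trap */
    puts("tracee trapped");

    kill(pid, SIGKILL);
    waitpid(pid, NULL, 0);
    return 0;
}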
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 7784bd216b6a..ddddb320be61 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -37,7 +37,7 @@
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 2e138db03382..98f51b13bb7e 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -33,7 +33,7 @@
#include <linux/rcupdate.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/completion.h>
#include <linux/moduleparam.h>
@@ -941,7 +941,6 @@ static void rcu_torture_timer(unsigned long unused)
idx = cur_ops->readlock();
completed = cur_ops->completed();
p = rcu_dereference_check(rcu_torture_current,
- rcu_read_lock_held() ||
rcu_read_lock_bh_held() ||
rcu_read_lock_sched_held() ||
srcu_read_lock_held(&srcu_ctl));
@@ -1002,7 +1001,6 @@ rcu_torture_reader(void *arg)
idx = cur_ops->readlock();
completed = cur_ops->completed();
p = rcu_dereference_check(rcu_torture_current,
- rcu_read_lock_held() ||
rcu_read_lock_bh_held() ||
rcu_read_lock_sched_held() ||
srcu_read_lock_held(&srcu_ctl));
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 7e59ffb3d0ba..ba06207b1dd3 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -84,9 +84,32 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
static struct rcu_state *rcu_state;
+/*
+ * The rcu_scheduler_active variable transitions from zero to one just
+ * before the first task is spawned. So when this variable is zero, RCU
+ * can assume that there is but one task, allowing RCU to (for example)
+ * optimize synchronize_sched() to a simple barrier(). When this variable
+ * is one, RCU must actually do all the hard work required to detect real
+ * grace periods. This variable is also used to suppress boot-time false
+ * positives from lockdep-RCU error checking.
+ */
int rcu_scheduler_active __read_mostly;
EXPORT_SYMBOL_GPL(rcu_scheduler_active);
+/*
+ * The rcu_scheduler_fully_active variable transitions from zero to one
+ * during the early_initcall() processing, which is after the scheduler
+ * is capable of creating new tasks. So RCU processing (for example,
+ * creating tasks for RCU priority boosting) must be delayed until after
+ * rcu_scheduler_fully_active transitions from zero to one. We also
+ * currently delay invocation of any RCU callbacks until after this point.
+ *
+ * It might later prove better for people registering RCU callbacks during
+ * early boot to take responsibility for these callbacks, but one step at
+ * a time.
+ */
+static int rcu_scheduler_fully_active __read_mostly;
+
#ifdef CONFIG_RCU_BOOST
/*
@@ -98,7 +121,6 @@ DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
DEFINE_PER_CPU(char, rcu_cpu_has_work);
-static char rcu_kthreads_spawnable;
#endif /* #ifdef CONFIG_RCU_BOOST */
@@ -1467,6 +1489,8 @@ static void rcu_process_callbacks(struct softirq_action *unused)
*/
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
{
+ if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active)))
+ return;
if (likely(!rsp->boost)) {
rcu_do_batch(rsp, rdp);
return;
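invoke_rcu_callbacks() now refuses to run anything until rcu_scheduler_fully_active is set from an early_initcall(), i.e. until kthreads can really be spawned; callbacks that arrive earlier simply wait. A tiny model of gating deferred work behind a boot-phase flag (structure only, no RCU semantics):

#include <stdbool.h>
#include <stdio.h>

static bool fully_active;           /* set once the "scheduler" is ready */
static int deferred;                /* callbacks held back meanwhile */

static void invoke_callbacks(void)
{
    if (!fully_active) {            /* too early: just accumulate */
        deferred++;
        return;
    }
    for (; deferred; deferred--)
        puts("run deferred callback");
    puts("run callback");
}

int main(void)
{
    invoke_callbacks();             /* boot-time: deferred */
    fully_active = true;            /* models the early_initcall() */
    invoke_callbacks();             /* now everything runs */
    return 0;
}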
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 14dc7dd00902..8aafbb80b8b0 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -68,6 +68,7 @@ struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
static struct rcu_state *rcu_state = &rcu_preempt_state;
+static void rcu_read_unlock_special(struct task_struct *t);
static int rcu_preempted_readers_exp(struct rcu_node *rnp);
/*
@@ -147,7 +148,7 @@ static void rcu_preempt_note_context_switch(int cpu)
struct rcu_data *rdp;
struct rcu_node *rnp;
- if (t->rcu_read_lock_nesting &&
+ if (t->rcu_read_lock_nesting > 0 &&
(t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
/* Possibly blocking in an RCU read-side critical section. */
@@ -190,6 +191,14 @@ static void rcu_preempt_note_context_switch(int cpu)
rnp->gp_tasks = &t->rcu_node_entry;
}
raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ } else if (t->rcu_read_lock_nesting < 0 &&
+ t->rcu_read_unlock_special) {
+
+ /*
+ * Complete exit from RCU read-side critical section on
+ * behalf of preempted instance of __rcu_read_unlock().
+ */
+ rcu_read_unlock_special(t);
}
/*
@@ -284,7 +293,7 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t,
* notify RCU core processing or task having blocked during the RCU
* read-side critical section.
*/
-static void rcu_read_unlock_special(struct task_struct *t)
+static noinline void rcu_read_unlock_special(struct task_struct *t)
{
int empty;
int empty_exp;
@@ -309,7 +318,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
}
/* Hardware IRQ handlers cannot block. */
- if (in_irq()) {
+ if (in_irq() || in_serving_softirq()) {
local_irq_restore(flags);
return;
}
@@ -342,6 +351,11 @@ static void rcu_read_unlock_special(struct task_struct *t)
#ifdef CONFIG_RCU_BOOST
if (&t->rcu_node_entry == rnp->boost_tasks)
rnp->boost_tasks = np;
+ /* Snapshot and clear ->rcu_boosted with rcu_node lock held. */
+ if (t->rcu_boosted) {
+ special |= RCU_READ_UNLOCK_BOOSTED;
+ t->rcu_boosted = 0;
+ }
#endif /* #ifdef CONFIG_RCU_BOOST */
t->rcu_blocked_node = NULL;
@@ -358,7 +372,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
#ifdef CONFIG_RCU_BOOST
/* Unboost if we were boosted. */
if (special & RCU_READ_UNLOCK_BOOSTED) {
- t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
rt_mutex_unlock(t->rcu_boost_mutex);
t->rcu_boost_mutex = NULL;
}
@@ -387,13 +400,22 @@ void __rcu_read_unlock(void)
struct task_struct *t = current;
barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
- --t->rcu_read_lock_nesting;
- barrier(); /* decrement before load of ->rcu_read_unlock_special */
- if (t->rcu_read_lock_nesting == 0 &&
- unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
- rcu_read_unlock_special(t);
+ if (t->rcu_read_lock_nesting != 1)
+ --t->rcu_read_lock_nesting;
+ else {
+ t->rcu_read_lock_nesting = INT_MIN;
+ barrier(); /* assign before ->rcu_read_unlock_special load */
+ if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
+ rcu_read_unlock_special(t);
+ barrier(); /* ->rcu_read_unlock_special load before assign */
+ t->rcu_read_lock_nesting = 0;
+ }
#ifdef CONFIG_PROVE_LOCKING
- WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0);
+ {
+ int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
+
+ WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
+ }
#endif /* #ifdef CONFIG_PROVE_LOCKING */
}
EXPORT_SYMBOL_GPL(__rcu_read_unlock);
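The INT_MIN trick above makes the outermost rcu_read_unlock() re-entrant: nesting is parked at a large negative sentinel while rcu_read_unlock_special() runs, so the new negative-nesting branch in rcu_preempt_note_context_switch() can finish the unlock on behalf of a preempted task instead of mistaking it for a nested reader. A userspace model of the sentinel (memory barriers elided):

#include <limits.h>
#include <stdio.h>

static int nesting;                 /* models t->rcu_read_lock_nesting */

static void unlock_special(void)
{
    /* While this runs, nesting == INT_MIN flags "outermost unlock
     * in progress" to anyone who interrupts us. */
    printf("special work, nesting=%d\n", nesting);
}

static void read_unlock(void)
{
    if (nesting != 1) {
        --nesting;                  /* plain nested unlock */
    } else {
        nesting = INT_MIN;          /* park at the sentinel */
        unlock_special();
        nesting = 0;                /* really outside now */
    }
}

int main(void)
{
    nesting = 2;                    /* two nested read-side sections */
    read_unlock();                  /* nesting -> 1 */
    read_unlock();                  /* sentinel path */
    printf("final nesting=%d\n", nesting);
    return 0;
}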
@@ -589,7 +611,8 @@ static void rcu_preempt_check_callbacks(int cpu)
rcu_preempt_qs(cpu);
return;
}
- if (per_cpu(rcu_preempt_data, cpu).qs_pending)
+ if (t->rcu_read_lock_nesting > 0 &&
+ per_cpu(rcu_preempt_data, cpu).qs_pending)
t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
}
@@ -695,9 +718,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
raw_spin_lock_irqsave(&rnp->lock, flags);
for (;;) {
- if (!sync_rcu_preempt_exp_done(rnp))
+ if (!sync_rcu_preempt_exp_done(rnp)) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
break;
+ }
if (rnp->parent == NULL) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
wake_up(&sync_rcu_preempt_exp_wq);
break;
}
@@ -707,7 +733,6 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
raw_spin_lock(&rnp->lock); /* irqs already disabled */
rnp->expmask &= ~mask;
}
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
/*
@@ -1174,7 +1199,7 @@ static int rcu_boost(struct rcu_node *rnp)
t = container_of(tb, struct task_struct, rcu_node_entry);
rt_mutex_init_proxy_locked(&mtx, t);
t->rcu_boost_mutex = &mtx;
- t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
+ t->rcu_boosted = 1;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
@@ -1532,7 +1557,7 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
struct sched_param sp;
struct task_struct *t;
- if (!rcu_kthreads_spawnable ||
+ if (!rcu_scheduler_fully_active ||
per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
return 0;
t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu);
@@ -1639,7 +1664,7 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
struct sched_param sp;
struct task_struct *t;
- if (!rcu_kthreads_spawnable ||
+ if (!rcu_scheduler_fully_active ||
rnp->qsmaskinit == 0)
return 0;
if (rnp->node_kthread_task == NULL) {
@@ -1665,7 +1690,7 @@ static int __init rcu_spawn_kthreads(void)
int cpu;
struct rcu_node *rnp;
- rcu_kthreads_spawnable = 1;
+ rcu_scheduler_fully_active = 1;
for_each_possible_cpu(cpu) {
per_cpu(rcu_cpu_has_work, cpu) = 0;
if (cpu_online(cpu))
@@ -1687,7 +1712,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
struct rcu_node *rnp = rdp->mynode;
/* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
- if (rcu_kthreads_spawnable) {
+ if (rcu_scheduler_fully_active) {
(void)rcu_spawn_one_cpu_kthread(cpu);
if (rnp->node_kthread_task == NULL)
(void)rcu_spawn_one_node_kthread(rcu_state, rnp);
@@ -1726,6 +1751,13 @@ static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
{
}
+static int __init rcu_scheduler_really_started(void)
+{
+ rcu_scheduler_fully_active = 1;
+ return 0;
+}
+early_initcall(rcu_scheduler_really_started);
+
static void __cpuinit rcu_prepare_kthreads(int cpu)
{
}
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 4e144876dc68..3b0c0986afc0 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -31,7 +31,7 @@
#include <linux/rcupdate.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/completion.h>
diff --git a/kernel/resource.c b/kernel/resource.c
index 798e2fae2a06..3b3cedc52592 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -38,6 +38,14 @@ struct resource iomem_resource = {
};
EXPORT_SYMBOL(iomem_resource);
+/* constraints to be met while allocating resources */
+struct resource_constraint {
+ resource_size_t min, max, align;
+ resource_size_t (*alignf)(void *, const struct resource *,
+ resource_size_t, resource_size_t);
+ void *alignf_data;
+};
+
static DEFINE_RWLOCK(resource_lock);
static void *r_next(struct seq_file *m, void *v, loff_t *pos)
@@ -384,16 +392,13 @@ static bool resource_contains(struct resource *res1, struct resource *res2)
}
/*
- * Find empty slot in the resource tree given range and alignment.
+ * Find empty slot in the resource tree with the given range and
+ * alignment constraints
*/
-static int find_resource(struct resource *root, struct resource *new,
- resource_size_t size, resource_size_t min,
- resource_size_t max, resource_size_t align,
- resource_size_t (*alignf)(void *,
- const struct resource *,
- resource_size_t,
- resource_size_t),
- void *alignf_data)
+static int __find_resource(struct resource *root, struct resource *old,
+ struct resource *new,
+ resource_size_t size,
+ struct resource_constraint *constraint)
{
struct resource *this = root->child;
struct resource tmp = *new, avail, alloc;
@@ -404,25 +409,26 @@ static int find_resource(struct resource *root, struct resource *new,
* Skip past an allocated resource that starts at 0, since the assignment
* of this->start - 1 to tmp->end below would cause an underflow.
*/
- if (this && this->start == 0) {
- tmp.start = this->end + 1;
+ if (this && this->start == root->start) {
+ tmp.start = (this == old) ? old->start : this->end + 1;
this = this->sibling;
}
for(;;) {
if (this)
- tmp.end = this->start - 1;
+ tmp.end = (this == old) ? this->end : this->start - 1;
else
tmp.end = root->end;
- resource_clip(&tmp, min, max);
+ resource_clip(&tmp, constraint->min, constraint->max);
arch_remove_reservations(&tmp);
/* Check for overflow after ALIGN() */
avail = *new;
- avail.start = ALIGN(tmp.start, align);
+ avail.start = ALIGN(tmp.start, constraint->align);
avail.end = tmp.end;
if (avail.start >= tmp.start) {
- alloc.start = alignf(alignf_data, &avail, size, align);
+ alloc.start = constraint->alignf(constraint->alignf_data, &avail,
+ size, constraint->align);
alloc.end = alloc.start + size - 1;
if (resource_contains(&avail, &alloc)) {
new->start = alloc.start;
@@ -432,14 +438,75 @@ static int find_resource(struct resource *root, struct resource *new,
}
if (!this)
break;
- tmp.start = this->end + 1;
+ if (this != old)
+ tmp.start = this->end + 1;
this = this->sibling;
}
return -EBUSY;
}
+/*
+ * Find empty slot in the resource tree given range and alignment.
+ */
+static int find_resource(struct resource *root, struct resource *new,
+ resource_size_t size,
+ struct resource_constraint *constraint)
+{
+ return __find_resource(root, NULL, new, size, constraint);
+}
+
/**
- * allocate_resource - allocate empty slot in the resource tree given range & alignment
+ * reallocate_resource - allocate a slot in the resource tree given range & alignment.
+ * The resource will be relocated if the new size cannot be
+ * accommodated at the current location.
+ *
+ * @root: root resource descriptor
+ * @old: resource descriptor desired by caller
+ * @newsize: new size of the resource descriptor
+ * @constraint: the size and alignment constraints to be met.
+ */
+int reallocate_resource(struct resource *root, struct resource *old,
+ resource_size_t newsize,
+ struct resource_constraint *constraint)
+{
+ int err = 0;
+ struct resource new = *old;
+ struct resource *conflict;
+
+ write_lock(&resource_lock);
+
+ if ((err = __find_resource(root, old, &new, newsize, constraint)))
+ goto out;
+
+ if (resource_contains(&new, old)) {
+ old->start = new.start;
+ old->end = new.end;
+ goto out;
+ }
+
+ if (old->child) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ if (resource_contains(old, &new)) {
+ old->start = new.start;
+ old->end = new.end;
+ } else {
+ __release_resource(old);
+ *old = new;
+ conflict = __request_resource(root, old);
+ BUG_ON(conflict);
+ }
+out:
+ write_unlock(&resource_lock);
+ return err;
+}
+
+
+/**
+ * allocate_resource - allocate empty slot in the resource tree given range & alignment.
+ * The resource will be reallocated with a new size if it was already allocated.
* @root: root resource descriptor
* @new: resource descriptor desired by caller
* @size: requested resource region size
@@ -459,12 +526,25 @@ int allocate_resource(struct resource *root, struct resource *new,
void *alignf_data)
{
int err;
+ struct resource_constraint constraint;
if (!alignf)
alignf = simple_align_resource;
+ constraint.min = min;
+ constraint.max = max;
+ constraint.align = align;
+ constraint.alignf = alignf;
+ constraint.alignf_data = alignf_data;
+
+ if (new->parent) {
+ /*
+ * Resource is already allocated; try reallocating with
+ * the new constraints.
+ */
+ return reallocate_resource(root, new, size, &constraint);
+ }
+
write_lock(&resource_lock);
- err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
+ err = find_resource(root, new, size, &constraint);
if (err >= 0 && __request_resource(root, new))
err = -EBUSY;
write_unlock(&resource_lock);
@@ -473,6 +553,27 @@ int allocate_resource(struct resource *root, struct resource *new,
EXPORT_SYMBOL(allocate_resource);
+/**
+ * lookup_resource - find an existing resource by a resource start address
+ * @root: root resource descriptor
+ * @start: resource start address
+ *
+ * Returns a pointer to the resource if found, NULL otherwise
+ */
+struct resource *lookup_resource(struct resource *root, resource_size_t start)
+{
+ struct resource *res;
+
+ read_lock(&resource_lock);
+ for (res = root->child; res; res = res->sibling) {
+ if (res->start == start)
+ break;
+ }
+ read_unlock(&resource_lock);
+
+ return res;
+}
+
/*
* Insert a resource into the resource tree. If successful, return NULL,
* otherwise return the conflicting resource (compare to __request_resource())
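lookup_resource() is a plain linear walk over a root's children keyed on the start address, taking resource_lock only for the scan. A userspace model of the sibling walk (struct layout reduced to what the walk needs, locking elided):

#include <stdio.h>

struct resource {
    unsigned long start, end;
    struct resource *sibling;       /* next child of the same parent */
};

/* Mirrors the new kernel helper: match a child on its start address. */
static struct resource *lookup_resource(struct resource *first_child,
                                        unsigned long start)
{
    struct resource *res;

    for (res = first_child; res; res = res->sibling)
        if (res->start == start)
            break;
    return res;
}

int main(void)
{
    struct resource b = { 0x2000, 0x2fff, NULL };
    struct resource a = { 0x1000, 0x1fff, &b };

    struct resource *hit = lookup_resource(&a, 0x2000);
    printf("found [%#lx-%#lx]\n", hit->start, hit->end);
    return 0;
}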
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index ab449117aaf2..255e1662acdb 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -890,7 +890,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name)
{
lock->owner = NULL;
raw_spin_lock_init(&lock->wait_lock);
- plist_head_init_raw(&lock->wait_list, &lock->wait_lock);
+ plist_head_init(&lock->wait_list);
debug_rt_mutex_init(lock, name);
}
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index cae050b05f5e..9f48f3d82e9b 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -11,7 +11,7 @@
#include <linux/rwsem.h>
#include <asm/system.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
/*
* lock for reading
@@ -117,15 +117,6 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
EXPORT_SYMBOL(down_read_nested);
-void down_read_non_owner(struct rw_semaphore *sem)
-{
- might_sleep();
-
- __down_read(sem);
-}
-
-EXPORT_SYMBOL(down_read_non_owner);
-
void down_write_nested(struct rw_semaphore *sem, int subclass)
{
might_sleep();
@@ -136,13 +127,6 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
EXPORT_SYMBOL(down_write_nested);
-void up_read_non_owner(struct rw_semaphore *sem)
-{
- __up_read(sem);
-}
-
-EXPORT_SYMBOL(up_read_non_owner);
-
#endif
diff --git a/kernel/sched.c b/kernel/sched.c
index d08d110b8976..ccacdbdecf45 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,6 +75,9 @@
#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include <asm/mutex.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
#include "sched_cpupri.h"
#include "workqueue_sched.h"
@@ -124,7 +127,7 @@
static inline int rt_policy(int policy)
{
- if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
+ if (policy == SCHED_FIFO || policy == SCHED_RR)
return 1;
return 0;
}
@@ -292,8 +295,8 @@ static DEFINE_SPINLOCK(task_group_lock);
* (The default weight is 1024 - so there's no practical
* limitation from this.)
*/
-#define MIN_SHARES 2
-#define MAX_SHARES (1UL << (18 + SCHED_LOAD_RESOLUTION))
+#define MIN_SHARES (1UL << 1)
+#define MAX_SHARES (1UL << 18)
static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
#endif
@@ -422,6 +425,7 @@ struct rt_rq {
*/
struct root_domain {
atomic_t refcount;
+ atomic_t rto_count;
struct rcu_head rcu;
cpumask_var_t span;
cpumask_var_t online;
@@ -431,7 +435,6 @@ struct root_domain {
* one runnable RT task.
*/
cpumask_var_t rto_mask;
- atomic_t rto_count;
struct cpupri cpupri;
};
@@ -528,6 +531,12 @@ struct rq {
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
#endif
+#ifdef CONFIG_PARAVIRT
+ u64 prev_steal_time;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ u64 prev_steal_time_rq;
+#endif
/* calc_load related fields */
unsigned long calc_load_update;
@@ -581,7 +590,6 @@ static inline int cpu_of(struct rq *rq)
#define rcu_dereference_check_sched_domain(p) \
rcu_dereference_check((p), \
- rcu_read_lock_held() || \
lockdep_is_held(&sched_domains_mutex))
/*
@@ -1568,38 +1576,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
return rq->avg_load_per_task;
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/*
- * Compute the cpu's hierarchical load factor for each task group.
- * This needs to be done in a top-down fashion because the load of a child
- * group is a fraction of its parents load.
- */
-static int tg_load_down(struct task_group *tg, void *data)
-{
- unsigned long load;
- long cpu = (long)data;
-
- if (!tg->parent) {
- load = cpu_rq(cpu)->load.weight;
- } else {
- load = tg->parent->cfs_rq[cpu]->h_load;
- load *= tg->se[cpu]->load.weight;
- load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
- }
-
- tg->cfs_rq[cpu]->h_load = load;
-
- return 0;
-}
-
-static void update_h_load(long cpu)
-{
- walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
-}
-
-#endif
-
#ifdef CONFIG_PREEMPT
static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -1953,10 +1929,28 @@ void account_system_vtime(struct task_struct *curr)
}
EXPORT_SYMBOL_GPL(account_system_vtime);
-static void update_rq_clock_task(struct rq *rq, s64 delta)
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#ifdef CONFIG_PARAVIRT
+static inline u64 steal_ticks(u64 steal)
{
- s64 irq_delta;
+ if (unlikely(steal > NSEC_PER_SEC))
+ return div_u64(steal, TICK_NSEC);
+
+ return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
+}
+#endif
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+/*
* In theory, the compiler should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+ s64 steal = 0, irq_delta = 0;
+#endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
/*
@@ -1979,12 +1973,35 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ if (static_branch((&paravirt_steal_rq_enabled))) {
+ u64 st;
+
+ steal = paravirt_steal_clock(cpu_of(rq));
+ steal -= rq->prev_steal_time_rq;
+
+ if (unlikely(steal > delta))
+ steal = delta;
+
+ st = steal_ticks(steal);
+ steal = st * TICK_NSEC;
+
+ rq->prev_steal_time_rq += steal;
+
+ delta -= steal;
+ }
+#endif
+
rq->clock_task += delta;
- if (irq_delta && sched_feat(NONIRQ_POWER))
- sched_rt_avg_update(rq, irq_delta);
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+ if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
+ sched_rt_avg_update(rq, irq_delta + steal);
+#endif
}
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
static int irqtime_account_hi_update(void)
{
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
@@ -2019,12 +2036,7 @@ static int irqtime_account_si_update(void)
#define sched_clock_irqtime (0)
-static void update_rq_clock_task(struct rq *rq, s64 delta)
-{
- rq->clock_task += delta;
-}
-
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#endif
#include "sched_idletask.c"
#include "sched_fair.c"
@@ -2497,7 +2509,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
if (p->sched_class->task_woken)
p->sched_class->task_woken(rq, p);
- if (unlikely(rq->idle_stamp)) {
+ if (rq->idle_stamp) {
u64 delta = rq->clock - rq->idle_stamp;
u64 max = 2*sysctl_sched_migration_cost;
@@ -2544,13 +2556,9 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
}
#ifdef CONFIG_SMP
-static void sched_ttwu_pending(void)
+static void sched_ttwu_do_pending(struct task_struct *list)
{
struct rq *rq = this_rq();
- struct task_struct *list = xchg(&rq->wake_list, NULL);
-
- if (!list)
- return;
raw_spin_lock(&rq->lock);
@@ -2563,9 +2571,45 @@ static void sched_ttwu_pending(void)
raw_spin_unlock(&rq->lock);
}
+#ifdef CONFIG_HOTPLUG_CPU
+
+static void sched_ttwu_pending(void)
+{
+ struct rq *rq = this_rq();
+ struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+ if (!list)
+ return;
+
+ sched_ttwu_do_pending(list);
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
void scheduler_ipi(void)
{
- sched_ttwu_pending();
+ struct rq *rq = this_rq();
+ struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+ if (!list)
+ return;
+
+ /*
+ * Not all reschedule IPI handlers call irq_enter/irq_exit, since
+ * traditionally all their work was done from the interrupt return
+ * path. Now that we actually do some work, we need to make sure
+ * we do call them.
+ *
+ * Some archs already call them; luckily, irq_enter/exit nest
+ * properly.
+ *
+ * Arguably we should visit all archs and update all handlers,
+ * however a fair share of IPIs are still resched only so this would
+ * somewhat pessimize the simple resched case.
+ */
+ irq_enter();
+ sched_ttwu_do_pending(list);
+ irq_exit();
}
static void ttwu_queue_remote(struct task_struct *p, int cpu)
@@ -2854,7 +2898,7 @@ void sched_fork(struct task_struct *p)
#if defined(CONFIG_SMP)
p->on_cpu = 0;
#endif
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPT_COUNT
/* Want to start with kernel preemption disabled. */
task_thread_info(p)->preempt_count = 1;
#endif
@@ -3845,6 +3889,25 @@ void account_idle_time(cputime_t cputime)
cpustat->idle = cputime64_add(cpustat->idle, cputime64);
}
+static __always_inline bool steal_account_process_tick(void)
+{
+#ifdef CONFIG_PARAVIRT
+ if (static_branch(&paravirt_steal_enabled)) {
+ u64 steal, st = 0;
+
+ steal = paravirt_steal_clock(smp_processor_id());
+ steal -= this_rq()->prev_steal_time;
+
+ st = steal_ticks(steal);
+ this_rq()->prev_steal_time += st * TICK_NSEC;
+
+ account_steal_time(st);
+ return st;
+ }
+#endif
+ return false;
+}
+
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
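steal_ticks() turns a nanosecond steal-clock delta into whole ticks: the common sub-second case uses the cheap iterative __iter_div_u64_rem() (important on 32-bit, where a full 64-bit division is costly), and only outliers pay for div_u64(). A userspace check of the arithmetic; TICK_NSEC here assumes HZ=1000, and the iterative divide is modeled as repeated subtraction:

#include <inttypes.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL
#define TICK_NSEC    1000000ULL     /* assumes HZ=1000 */

/* Cheap path for small quotients, standing in for __iter_div_u64_rem(). */
static uint64_t iter_div(uint64_t dividend, uint64_t divisor)
{
    uint64_t q = 0;

    while (dividend >= divisor) {
        dividend -= divisor;
        q++;
    }
    return q;
}

static uint64_t steal_ticks(uint64_t steal)
{
    if (steal > NSEC_PER_SEC)       /* rare: fall back to real division */
        return steal / TICK_NSEC;
    return iter_div(steal, TICK_NSEC);
}

int main(void)
{
    printf("%" PRIu64 "\n", steal_ticks(2500000));          /* 2 */
    printf("%" PRIu64 "\n", steal_ticks(5 * NSEC_PER_SEC)); /* 5000 */
    return 0;
}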
@@ -3876,6 +3939,9 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ if (steal_account_process_tick())
+ return;
+
if (irqtime_account_hi_update()) {
cpustat->irq = cputime64_add(cpustat->irq, tmp);
} else if (irqtime_account_si_update()) {
@@ -3929,6 +3995,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
return;
}
+ if (steal_account_process_tick())
+ return;
+
if (user_tick)
account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -4306,11 +4375,8 @@ EXPORT_SYMBOL(schedule);
static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
{
- bool ret = false;
-
- rcu_read_lock();
if (lock->owner != owner)
- goto fail;
+ return false;
/*
* Ensure we emit the owner->on_cpu, dereference _after_ checking
@@ -4320,11 +4386,7 @@ static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
*/
barrier();
- ret = owner->on_cpu;
-fail:
- rcu_read_unlock();
-
- return ret;
+ return owner->on_cpu;
}
/*
@@ -4336,21 +4398,21 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
if (!sched_feat(OWNER_SPIN))
return 0;
+ rcu_read_lock();
while (owner_running(lock, owner)) {
if (need_resched())
- return 0;
+ break;
arch_mutex_cpu_relax();
}
+ rcu_read_unlock();
/*
- * If the owner changed to another task there is likely
- * heavy contention, stop spinning.
+ * We break out the loop above on need_resched() and when the
+ * owner changed, which is a sign for heavy contention. Return
+ * success only when lock->owner is NULL.
*/
- if (lock->owner)
- return 0;
-
- return 1;
+ return lock->owner == NULL;
}
#endif
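The mutex_spin_on_owner() rewrite holds one rcu_read_lock() across the whole spin instead of re-taking it per iteration, and reports success purely on "the lock became free" rather than re-inspecting the owner. A userspace model of adaptive spinning on an owner field, with C11 atomics standing in for the ACCESS_ONCE/barrier dance (all names invented):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_mutex {
    _Atomic(int *) owner;           /* NULL when free; models lock->owner */
};

static bool owner_running(struct toy_mutex *lock, int *owner)
{
    if (atomic_load(&lock->owner) != owner)
        return false;               /* owner changed: heavy contention */
    return *owner;                  /* models owner->on_cpu */
}

/* Spin while the same owner is on-CPU; succeed only if the lock
 * actually ended up free. */
static int spin_on_owner(struct toy_mutex *lock, int *owner)
{
    while (owner_running(lock, owner))
        ;                           /* cpu_relax() in the real thing */
    return atomic_load(&lock->owner) == NULL;
}

int main(void)
{
    int on_cpu = 0;                 /* owner already off-CPU */
    struct toy_mutex m = { &on_cpu };

    printf("%d\n", spin_on_owner(&m, &on_cpu));     /* 0: still owned */
    atomic_store(&m.owner, NULL);
    printf("%d\n", spin_on_owner(&m, &on_cpu));     /* 1: free now */
    return 0;
}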
@@ -6557,7 +6619,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- if (!group->cpu_power) {
+ if (!group->sgp->power) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: domain->cpu_power not "
"set\n");
@@ -6581,9 +6643,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
printk(KERN_CONT " %s", str);
- if (group->cpu_power != SCHED_POWER_SCALE) {
+ if (group->sgp->power != SCHED_POWER_SCALE) {
printk(KERN_CONT " (cpu_power = %d)",
- group->cpu_power);
+ group->sgp->power);
}
group = group->next;
@@ -6774,11 +6836,39 @@ static struct root_domain *alloc_rootdomain(void)
return rd;
}
+static void free_sched_groups(struct sched_group *sg, int free_sgp)
+{
+ struct sched_group *tmp, *first;
+
+ if (!sg)
+ return;
+
+ first = sg;
+ do {
+ tmp = sg->next;
+
+ if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
+ kfree(sg->sgp);
+
+ kfree(sg);
+ sg = tmp;
+ } while (sg != first);
+}
+
static void free_sched_domain(struct rcu_head *rcu)
{
struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
- if (atomic_dec_and_test(&sd->groups->ref))
+
+ /*
+ * If its an overlapping domain it has private groups, iterate and
+ * nuke them all.
+ */
+ if (sd->flags & SD_OVERLAP) {
+ free_sched_groups(sd->groups, 1);
+ } else if (atomic_dec_and_test(&sd->groups->ref)) {
+ kfree(sd->groups->sgp);
kfree(sd->groups);
+ }
kfree(sd);
}
@@ -6945,6 +7035,7 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
struct sd_data {
struct sched_domain **__percpu sd;
struct sched_group **__percpu sg;
+ struct sched_group_power **__percpu sgp;
};
struct s_data {
@@ -6964,15 +7055,73 @@ struct sched_domain_topology_level;
typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
+#define SDTL_OVERLAP 0x01
+
struct sched_domain_topology_level {
sched_domain_init_f init;
sched_domain_mask_f mask;
+ int flags;
struct sd_data data;
};
-/*
- * Assumes the sched_domain tree is fully constructed
- */
+static int
+build_overlap_sched_groups(struct sched_domain *sd, int cpu)
+{
+ struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
+ const struct cpumask *span = sched_domain_span(sd);
+ struct cpumask *covered = sched_domains_tmpmask;
+ struct sd_data *sdd = sd->private;
+ struct sched_domain *child;
+ int i;
+
+ cpumask_clear(covered);
+
+ for_each_cpu(i, span) {
+ struct cpumask *sg_span;
+
+ if (cpumask_test_cpu(i, covered))
+ continue;
+
+ sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(i));
+
+ if (!sg)
+ goto fail;
+
+ sg_span = sched_group_cpus(sg);
+
+ child = *per_cpu_ptr(sdd->sd, i);
+ if (child->child) {
+ child = child->child;
+ cpumask_copy(sg_span, sched_domain_span(child));
+ } else
+ cpumask_set_cpu(i, sg_span);
+
+ cpumask_or(covered, covered, sg_span);
+
+ sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
+ atomic_inc(&sg->sgp->ref);
+
+ if (cpumask_test_cpu(cpu, sg_span))
+ groups = sg;
+
+ if (!first)
+ first = sg;
+ if (last)
+ last->next = sg;
+ last = sg;
+ last->next = first;
+ }
+ sd->groups = groups;
+
+ return 0;
+
+fail:
+ free_sched_groups(first, 0);
+
+ return -ENOMEM;
+}
+
static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
{
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
@@ -6981,24 +7130,24 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
if (child)
cpu = cpumask_first(sched_domain_span(child));
- if (sg)
+ if (sg) {
*sg = *per_cpu_ptr(sdd->sg, cpu);
+ (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
+ atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
+ }
return cpu;
}
/*
- * build_sched_groups takes the cpumask we wish to span, and a pointer
- * to a function which identifies what group(along with sched group) a CPU
- * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
- * (due to the fact that we keep track of groups covered with a struct cpumask).
- *
* build_sched_groups will build a circular linked list of the groups
* covered by the given span, and will set each group's ->cpumask correctly,
* and ->cpu_power to 0.
+ *
+ * Assumes the sched_domain tree is fully constructed
*/
-static void
-build_sched_groups(struct sched_domain *sd)
+static int
+build_sched_groups(struct sched_domain *sd, int cpu)
{
struct sched_group *first = NULL, *last = NULL;
struct sd_data *sdd = sd->private;
@@ -7006,6 +7155,12 @@ build_sched_groups(struct sched_domain *sd)
struct cpumask *covered;
int i;
+ get_group(cpu, sdd, &sd->groups);
+ atomic_inc(&sd->groups->ref);
+
+ if (cpu != cpumask_first(sched_domain_span(sd)))
+ return 0;
+
lockdep_assert_held(&sched_domains_mutex);
covered = sched_domains_tmpmask;
@@ -7020,7 +7175,7 @@ build_sched_groups(struct sched_domain *sd)
continue;
cpumask_clear(sched_group_cpus(sg));
- sg->cpu_power = 0;
+ sg->sgp->power = 0;
for_each_cpu(j, span) {
if (get_group(j, sdd, NULL) != group)
@@ -7037,6 +7192,8 @@ build_sched_groups(struct sched_domain *sd)
last = sg;
}
last->next = first;
+
+ return 0;
}
/*
@@ -7051,12 +7208,17 @@ build_sched_groups(struct sched_domain *sd)
*/
static void init_sched_groups_power(int cpu, struct sched_domain *sd)
{
- WARN_ON(!sd || !sd->groups);
+ struct sched_group *sg = sd->groups;
- if (cpu != group_first_cpu(sd->groups))
- return;
+ WARN_ON(!sd || !sg);
- sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
+ do {
+ sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+ sg = sg->next;
+ } while (sg != sd->groups);
+
+ if (cpu != group_first_cpu(sg))
+ return;
update_group_power(sd, cpu);
}
@@ -7177,15 +7339,15 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
static void claim_allocations(int cpu, struct sched_domain *sd)
{
struct sd_data *sdd = sd->private;
- struct sched_group *sg = sd->groups;
WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
*per_cpu_ptr(sdd->sd, cpu) = NULL;
- if (cpu == cpumask_first(sched_group_cpus(sg))) {
- WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
+ if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
*per_cpu_ptr(sdd->sg, cpu) = NULL;
- }
+
+ if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
+ *per_cpu_ptr(sdd->sgp, cpu) = NULL;
}
#ifdef CONFIG_SCHED_SMT
@@ -7210,7 +7372,7 @@ static struct sched_domain_topology_level default_topology[] = {
#endif
{ sd_init_CPU, cpu_cpu_mask, },
#ifdef CONFIG_NUMA
- { sd_init_NODE, cpu_node_mask, },
+ { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
{ sd_init_ALLNODES, cpu_allnodes_mask, },
#endif
{ NULL, },
@@ -7234,9 +7396,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
if (!sdd->sg)
return -ENOMEM;
+ sdd->sgp = alloc_percpu(struct sched_group_power *);
+ if (!sdd->sgp)
+ return -ENOMEM;
+
for_each_cpu(j, cpu_map) {
struct sched_domain *sd;
struct sched_group *sg;
+ struct sched_group_power *sgp;
sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
@@ -7251,6 +7418,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
return -ENOMEM;
*per_cpu_ptr(sdd->sg, j) = sg;
+
+ sgp = kzalloc_node(sizeof(struct sched_group_power),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sgp)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sgp, j) = sgp;
}
}
@@ -7266,11 +7440,15 @@ static void __sdt_free(const struct cpumask *cpu_map)
struct sd_data *sdd = &tl->data;
for_each_cpu(j, cpu_map) {
- kfree(*per_cpu_ptr(sdd->sd, j));
+ struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
+ if (sd && (sd->flags & SD_OVERLAP))
+ free_sched_groups(sd->groups, 0);
kfree(*per_cpu_ptr(sdd->sg, j));
+ kfree(*per_cpu_ptr(sdd->sgp, j));
}
free_percpu(sdd->sd);
free_percpu(sdd->sg);
+ free_percpu(sdd->sgp);
}
}
@@ -7316,8 +7494,13 @@ static int build_sched_domains(const struct cpumask *cpu_map,
struct sched_domain_topology_level *tl;
sd = NULL;
- for (tl = sched_domain_topology; tl->init; tl++)
+ for (tl = sched_domain_topology; tl->init; tl++) {
sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
+ if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
+ sd->flags |= SD_OVERLAP;
+ if (cpumask_equal(cpu_map, sched_domain_span(sd)))
+ break;
+ }
while (sd->child)
sd = sd->child;
@@ -7329,13 +7512,13 @@ static int build_sched_domains(const struct cpumask *cpu_map,
for_each_cpu(i, cpu_map) {
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
sd->span_weight = cpumask_weight(sched_domain_span(sd));
- get_group(i, sd->private, &sd->groups);
- atomic_inc(&sd->groups->ref);
-
- if (i != cpumask_first(sched_domain_span(sd)))
- continue;
-
- build_sched_groups(sd);
+ if (sd->flags & SD_OVERLAP) {
+ if (build_overlap_sched_groups(sd, i))
+ goto error;
+ } else {
+ if (build_sched_groups(sd, i))
+ goto error;
+ }
}
}
@@ -7745,18 +7928,14 @@ int in_sched_functions(unsigned long addr)
&& addr < (unsigned long)__sched_text_end);
}
-static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
+static void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT;
INIT_LIST_HEAD(&cfs_rq->tasks);
-#ifdef CONFIG_FAIR_GROUP_SCHED
- cfs_rq->rq = rq;
- /* allow initial update_cfs_load() to truncate */
-#ifdef CONFIG_SMP
- cfs_rq->load_stamp = 1;
-#endif
-#endif
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
+#ifndef CONFIG_64BIT
+ cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
+#endif
}
static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
@@ -7772,27 +7951,18 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
/* delimiter for bitsearch: */
__set_bit(MAX_RT_PRIO, array->bitmap);
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+#if defined CONFIG_SMP
rt_rq->highest_prio.curr = MAX_RT_PRIO;
-#ifdef CONFIG_SMP
rt_rq->highest_prio.next = MAX_RT_PRIO;
-#endif
-#endif
-#ifdef CONFIG_SMP
rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
- plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock);
+ plist_head_init(&rt_rq->pushable_tasks);
#endif
rt_rq->rt_time = 0;
rt_rq->rt_throttled = 0;
rt_rq->rt_runtime = 0;
raw_spin_lock_init(&rt_rq->rt_runtime_lock);
-
-#ifdef CONFIG_RT_GROUP_SCHED
- rt_rq->rt_nr_boosted = 0;
- rt_rq->rq = rq;
-#endif
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -7801,11 +7971,17 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
struct sched_entity *parent)
{
struct rq *rq = cpu_rq(cpu);
- tg->cfs_rq[cpu] = cfs_rq;
- init_cfs_rq(cfs_rq, rq);
+
cfs_rq->tg = tg;
+ cfs_rq->rq = rq;
+#ifdef CONFIG_SMP
+ /* allow initial update_cfs_load() to truncate */
+ cfs_rq->load_stamp = 1;
+#endif
+ tg->cfs_rq[cpu] = cfs_rq;
tg->se[cpu] = se;
+
/* se could be NULL for root_task_group */
if (!se)
return;
@@ -7828,12 +8004,14 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
{
struct rq *rq = cpu_rq(cpu);
- tg->rt_rq[cpu] = rt_rq;
- init_rt_rq(rt_rq, rq);
+ rt_rq->highest_prio.curr = MAX_RT_PRIO;
+ rt_rq->rt_nr_boosted = 0;
+ rt_rq->rq = rq;
rt_rq->tg = tg;
- rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
+ tg->rt_rq[cpu] = rt_rq;
tg->rt_se[cpu] = rt_se;
+
if (!rt_se)
return;
@@ -7915,7 +8093,7 @@ void __init sched_init(void)
rq->nr_running = 0;
rq->calc_load_active = 0;
rq->calc_load_update = jiffies + LOAD_FREQ;
- init_cfs_rq(&rq->cfs, rq);
+ init_cfs_rq(&rq->cfs);
init_rt_rq(&rq->rt, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.shares = root_task_group_load;
@@ -7986,7 +8164,7 @@ void __init sched_init(void)
#endif
#ifdef CONFIG_RT_MUTEXES
- plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock);
+ plist_head_init(&init_task.pi_waiters);
#endif
/*
@@ -8029,7 +8207,7 @@ void __init sched_init(void)
scheduler_running = 1;
}
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
static inline int preempt_count_equals(int preempt_offset)
{
int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
@@ -8039,7 +8217,6 @@ static inline int preempt_count_equals(int preempt_offset)
void __might_sleep(const char *file, int line, int preempt_offset)
{
-#ifdef in_atomic
static unsigned long prev_jiffy; /* ratelimiting */
if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
@@ -8061,7 +8238,6 @@ void __might_sleep(const char *file, int line, int preempt_offset)
if (irqs_disabled())
print_irqtrace_events(current);
dump_stack();
-#endif
}
EXPORT_SYMBOL(__might_sleep);
#endif
@@ -8220,6 +8396,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
if (!se)
goto err_free_rq;
+ init_cfs_rq(cfs_rq);
init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
}
@@ -8247,7 +8424,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
-#else /* !CONFG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED */
static inline void free_fair_sched_group(struct task_group *tg)
{
}
@@ -8268,7 +8445,8 @@ static void free_rt_sched_group(struct task_group *tg)
{
int i;
- destroy_rt_bandwidth(&tg->rt_bandwidth);
+ if (tg->rt_se)
+ destroy_rt_bandwidth(&tg->rt_bandwidth);
for_each_possible_cpu(i) {
if (tg->rt_rq)
@@ -8309,6 +8487,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
if (!rt_se)
goto err_free_rq;
+ init_rt_rq(rt_rq, cpu_rq(i));
+ rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
}
@@ -8450,10 +8630,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
if (!tg->se[0])
return -EINVAL;
- if (shares < MIN_SHARES)
- shares = MIN_SHARES;
- else if (shares > MAX_SHARES)
- shares = MAX_SHARES;
+ shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
mutex_lock(&shares_mutex);
if (tg->shares == shares)
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
index 05577055cfca..c2f0e7248dca 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched_autogroup.h
@@ -13,6 +13,7 @@ struct autogroup {
int nice;
};
+static inline bool task_group_is_autogroup(struct task_group *tg);
static inline struct task_group *
autogroup_task_group(struct task_struct *p, struct task_group *tg);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 433491c2dc8f..bc8ee9993814 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -135,14 +135,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return grp->my_q;
}
-/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
- * another cpu ('this_cpu')
- */
-static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
-{
- return cfs_rq->tg->cfs_rq[this_cpu];
-}
-
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (!cfs_rq->on_list) {
@@ -271,11 +263,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return NULL;
}
-static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
-{
- return &cpu_rq(this_cpu)->cfs;
-}
-
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}
@@ -334,11 +321,6 @@ static inline int entity_before(struct sched_entity *a,
return (s64)(a->vruntime - b->vruntime) < 0;
}
-static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- return se->vruntime - cfs_rq->min_vruntime;
-}
-
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
u64 vruntime = cfs_rq->min_vruntime;
@@ -372,7 +354,6 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
struct rb_node *parent = NULL;
struct sched_entity *entry;
- s64 key = entity_key(cfs_rq, se);
int leftmost = 1;
/*
@@ -385,7 +366,7 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* We dont care about collisions. Nodes with
* the same key stay together.
*/
- if (key < entity_key(cfs_rq, entry)) {
+ if (entity_before(se, entry)) {
link = &parent->rb_left;
} else {
link = &parent->rb_right;
@@ -1336,7 +1317,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
}
for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ cfs_rq = cfs_rq_of(se);
update_cfs_load(cfs_rq, 0);
update_cfs_shares(cfs_rq);
@@ -1370,13 +1351,16 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*/
if (task_sleep && parent_entity(se))
set_next_buddy(parent_entity(se));
+
+ /* avoid re-evaluating load for this entity */
+ se = parent_entity(se);
break;
}
flags |= DEQUEUE_SLEEP;
}
for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ cfs_rq = cfs_rq_of(se);
update_cfs_load(cfs_rq, 0);
update_cfs_shares(cfs_rq);
@@ -1481,7 +1465,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
* effect of the currently running task from the load
* of the current CPU:
*/
- rcu_read_lock();
if (sync) {
tg = task_group(current);
weight = current->se.load.weight;
@@ -1517,7 +1500,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
balanced = this_eff_load <= prev_eff_load;
} else
balanced = true;
- rcu_read_unlock();
/*
* If the currently running task will sleep within
@@ -1585,7 +1567,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
}
/* Adjust by relative CPU power of the group */
- avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power;
+ avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
if (local_group) {
this_load = avg_load;
@@ -1921,8 +1903,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
if (!sched_feat(WAKEUP_PREEMPT))
return;
- update_curr(cfs_rq);
find_matching_se(&se, &pse);
+ update_curr(cfs_rq_of(se));
BUG_ON(!pse);
if (wakeup_preempt_entity(se, pse) == 1) {
/*
@@ -2231,11 +2213,43 @@ static void update_shares(int cpu)
struct rq *rq = cpu_rq(cpu);
rcu_read_lock();
+ /*
+ * Iterates the task_group tree in a bottom-up fashion; see
+ * list_add_leaf_cfs_rq() for details.
+ */
for_each_leaf_cfs_rq(rq, cfs_rq)
update_shares_cpu(cfs_rq->tg, cpu);
rcu_read_unlock();
}
+/*
+ * Compute the cpu's hierarchical load factor for each task group.
+ * This needs to be done in a top-down fashion because the load of a child
+ * group is a fraction of its parent's load.
+ */
+static int tg_load_down(struct task_group *tg, void *data)
+{
+ unsigned long load;
+ long cpu = (long)data;
+
+ if (!tg->parent) {
+ load = cpu_rq(cpu)->load.weight;
+ } else {
+ load = tg->parent->cfs_rq[cpu]->h_load;
+ load *= tg->se[cpu]->load.weight;
+ load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
+ }
+
+ tg->cfs_rq[cpu]->h_load = load;
+
+ return 0;
+}
+
+static void update_h_load(long cpu)
+{
+ walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
+}
+
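/*
 * Illustrative sketch, not part of the patch: one step of the tg_load_down()
 * recurrence with invented weights. Each level scales the parent's h_load by
 * the group entity's share of the parent runqueue weight; the "+ 1" in the
 * divisor guards against dividing by a zero weight.
 */
#include <stdio.h>

int main(void)
{
        unsigned long h_load = 3072;        /* root: rq->load.weight */
        unsigned long se_weight = 1024;     /* tg->se[cpu]->load.weight */
        unsigned long parent_weight = 2048; /* parent cfs_rq->load.weight */

        h_load = h_load * se_weight / (parent_weight + 1);
        printf("child h_load = %lu\n", h_load); /* 3072*1024/2049 = 1535 */
        return 0;
}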
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
@@ -2243,14 +2257,12 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
int *all_pinned)
{
long rem_load_move = max_load_move;
- int busiest_cpu = cpu_of(busiest);
- struct task_group *tg;
+ struct cfs_rq *busiest_cfs_rq;
rcu_read_lock();
- update_h_load(busiest_cpu);
+ update_h_load(cpu_of(busiest));
- list_for_each_entry_rcu(tg, &task_groups, list) {
- struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
+ for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) {
unsigned long busiest_h_load = busiest_cfs_rq->h_load;
unsigned long busiest_weight = busiest_cfs_rq->load.weight;
u64 rem_load, moved_load;
@@ -2631,7 +2643,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
power >>= SCHED_POWER_SHIFT;
}
- sdg->cpu_power_orig = power;
+ sdg->sgp->power_orig = power;
if (sched_feat(ARCH_POWER))
power *= arch_scale_freq_power(sd, cpu);
@@ -2647,7 +2659,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
power = 1;
cpu_rq(cpu)->cpu_power = power;
- sdg->cpu_power = power;
+ sdg->sgp->power = power;
}
static void update_group_power(struct sched_domain *sd, int cpu)
@@ -2665,11 +2677,11 @@ static void update_group_power(struct sched_domain *sd, int cpu)
group = child->groups;
do {
- power += group->cpu_power;
+ power += group->sgp->power;
group = group->next;
} while (group != child->groups);
- sdg->cpu_power = power;
+ sdg->sgp->power = power;
}
/*
@@ -2691,7 +2703,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
/*
* If ~90% of the cpu_power is still there, we're good.
*/
- if (group->cpu_power * 32 > group->cpu_power_orig * 29)
+ if (group->sgp->power * 32 > group->sgp->power_orig * 29)
return 1;
return 0;
@@ -2771,7 +2783,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
}
/* Adjust by relative CPU power of the group */
- sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->cpu_power;
+ sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
/*
* Consider the group unbalanced when the imbalance is larger
@@ -2788,7 +2800,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
sgs->group_imb = 1;
- sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power,
+ sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
SCHED_POWER_SCALE);
if (!sgs->group_capacity)
sgs->group_capacity = fix_small_capacity(sd, group);
@@ -2877,7 +2889,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
return;
sds->total_load += sgs.group_load;
- sds->total_pwr += sg->cpu_power;
+ sds->total_pwr += sg->sgp->power;
/*
* In case the child domain prefers tasks go to siblings
@@ -2962,7 +2974,7 @@ static int check_asym_packing(struct sched_domain *sd,
if (this_cpu > busiest_cpu)
return 0;
- *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
+ *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power,
SCHED_POWER_SCALE);
return 1;
}
@@ -2993,7 +3005,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
scaled_busy_load_per_task = sds->busiest_load_per_task
* SCHED_POWER_SCALE;
- scaled_busy_load_per_task /= sds->busiest->cpu_power;
+ scaled_busy_load_per_task /= sds->busiest->sgp->power;
if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
(scaled_busy_load_per_task * imbn)) {
@@ -3007,28 +3019,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
* moving them.
*/
- pwr_now += sds->busiest->cpu_power *
+ pwr_now += sds->busiest->sgp->power *
min(sds->busiest_load_per_task, sds->max_load);
- pwr_now += sds->this->cpu_power *
+ pwr_now += sds->this->sgp->power *
min(sds->this_load_per_task, sds->this_load);
pwr_now /= SCHED_POWER_SCALE;
/* Amount of load we'd subtract */
tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
- sds->busiest->cpu_power;
+ sds->busiest->sgp->power;
if (sds->max_load > tmp)
- pwr_move += sds->busiest->cpu_power *
+ pwr_move += sds->busiest->sgp->power *
min(sds->busiest_load_per_task, sds->max_load - tmp);
/* Amount of load we'd add */
- if (sds->max_load * sds->busiest->cpu_power <
+ if (sds->max_load * sds->busiest->sgp->power <
sds->busiest_load_per_task * SCHED_POWER_SCALE)
- tmp = (sds->max_load * sds->busiest->cpu_power) /
- sds->this->cpu_power;
+ tmp = (sds->max_load * sds->busiest->sgp->power) /
+ sds->this->sgp->power;
else
tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
- sds->this->cpu_power;
- pwr_move += sds->this->cpu_power *
+ sds->this->sgp->power;
+ pwr_move += sds->this->sgp->power *
min(sds->this_load_per_task, sds->this_load + tmp);
pwr_move /= SCHED_POWER_SCALE;
@@ -3074,7 +3086,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
- load_above_capacity /= sds->busiest->cpu_power;
+ load_above_capacity /= sds->busiest->sgp->power;
}
/*
@@ -3090,8 +3102,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
/* How much load to actually move to equalise the imbalance */
- *imbalance = min(max_pull * sds->busiest->cpu_power,
- (sds->avg_load - sds->this_load) * sds->this->cpu_power)
+ *imbalance = min(max_pull * sds->busiest->sgp->power,
+ (sds->avg_load - sds->this_load) * sds->this->sgp->power)
/ SCHED_POWER_SCALE;
/*
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index be40f7371ee1..2e74677cb040 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -61,12 +61,14 @@ SCHED_FEAT(LB_BIAS, 1)
SCHED_FEAT(OWNER_SPIN, 1)
/*
- * Decrement CPU power based on irq activity
+ * Decrement CPU power based on time not spent running tasks
*/
-SCHED_FEAT(NONIRQ_POWER, 1)
+SCHED_FEAT(NONTASK_POWER, 1)
/*
* Queue remote wakeups on the target CPU and process them
* using the scheduler IPI. Reduces rq->lock contention/bounces.
*/
SCHED_FEAT(TTWU_QUEUE, 1)
+
+SCHED_FEAT(FORCE_SD_OVERLAP, 0)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 10d018212bab..97540f0c9e47 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -185,11 +185,23 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
typedef struct task_group *rt_rq_iter_t;
-#define for_each_rt_rq(rt_rq, iter, rq) \
- for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \
- (&iter->list != &task_groups) && \
- (rt_rq = iter->rt_rq[cpu_of(rq)]); \
- iter = list_entry_rcu(iter->list.next, typeof(*iter), list))
+static inline struct task_group *next_task_group(struct task_group *tg)
+{
+ do {
+ tg = list_entry_rcu(tg->list.next,
+ typeof(struct task_group), list);
+ } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
+
+ if (&tg->list == &task_groups)
+ tg = NULL;
+
+ return tg;
+}
+
+#define for_each_rt_rq(rt_rq, iter, rq) \
+ for (iter = container_of(&task_groups, typeof(*iter), list); \
+ (iter = next_task_group(iter)) && \
+ (rt_rq = iter->rt_rq[cpu_of(rq)]);)
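/*
 * Illustrative sketch, not part of the patch: the shape of the rewritten
 * for_each_rt_rq() - a helper advances the cursor while skipping filtered
 * entries, and the loop condition both assigns and tests. The data and
 * names below are invented.
 */
#include <stdio.h>

struct tg { int id; int autogroup; };

static struct tg groups[] = {
        { 0, 0 }, { 1, 1 }, { 2, 0 }, { 3, 1 }, { 4, 0 },
};
#define NGROUPS ((int)(sizeof(groups) / sizeof(groups[0])))

static struct tg *next_tg(int *i)
{
        do {
                (*i)++;
        } while (*i < NGROUPS && groups[*i].autogroup);

        return *i < NGROUPS ? &groups[*i] : NULL;
}

#define for_each_tg(iter, i) \
        for ((i) = -1; ((iter) = next_tg(&(i)));)

int main(void)
{
        struct tg *it;
        int i;

        for_each_tg(it, i)
                printf("tg %d\n", it->id);      /* prints 0, 2, 4 */
        return 0;
}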
static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
{
@@ -1126,7 +1138,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
rt_rq = &rq->rt;
- if (unlikely(!rt_rq->rt_nr_running))
+ if (!rt_rq->rt_nr_running)
return NULL;
if (rt_rq_throttled(rt_rq))
@@ -1548,7 +1560,7 @@ skip:
static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
{
/* Try to pull RT tasks here if we lower this rq's prio */
- if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
+ if (rq->rt.highest_prio.curr > prev->prio)
pull_rt_task(rq);
}
diff --git a/kernel/signal.c b/kernel/signal.c
index ff7678603328..291c9700be75 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -87,7 +87,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
/*
* Tracers may want to know about even ignored signals.
*/
- return !tracehook_consider_ignored_signal(t, sig);
+ return !t->ptrace;
}
/*
@@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
static int recalc_sigpending_tsk(struct task_struct *t)
{
- if ((t->group_stop & GROUP_STOP_PENDING) ||
+ if ((t->jobctl & JOBCTL_PENDING_MASK) ||
PENDING(&t->pending, &t->blocked) ||
PENDING(&t->signal->shared_pending, &t->blocked)) {
set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -150,9 +150,7 @@ void recalc_sigpending_and_wake(struct task_struct *t)
void recalc_sigpending(void)
{
- if (unlikely(tracehook_force_sigpending()))
- set_thread_flag(TIF_SIGPENDING);
- else if (!recalc_sigpending_tsk(current) && !freezing(current))
+ if (!recalc_sigpending_tsk(current) && !freezing(current))
clear_thread_flag(TIF_SIGPENDING);
}
@@ -224,47 +222,93 @@ static inline void print_dropped_signal(int sig)
}
/**
- * task_clear_group_stop_trapping - clear group stop trapping bit
+ * task_set_jobctl_pending - set jobctl pending bits
* @task: target task
+ * @mask: pending bits to set
*
- * If GROUP_STOP_TRAPPING is set, a ptracer is waiting for us. Clear it
- * and wake up the ptracer. Note that we don't need any further locking.
- * @task->siglock guarantees that @task->parent points to the ptracer.
+ * Set @mask in @task->jobctl. @mask must be a subset of
+ * %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK |
+ * %JOBCTL_TRAPPING. If stop signo is being set, the existing signo is
+ * cleared. If @task is already being killed or exiting, this function
+ * becomes a no-op.
*
* CONTEXT:
* Must be called with @task->sighand->siglock held.
+ *
+ * RETURNS:
+ * %true if @mask is set, %false if made a no-op because @task was dying.
*/
-static void task_clear_group_stop_trapping(struct task_struct *task)
+bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask)
{
- if (unlikely(task->group_stop & GROUP_STOP_TRAPPING)) {
- task->group_stop &= ~GROUP_STOP_TRAPPING;
- __wake_up_sync_key(&task->parent->signal->wait_chldexit,
- TASK_UNINTERRUPTIBLE, 1, task);
+ BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
+ JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
+ BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK));
+
+ if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING)))
+ return false;
+
+ if (mask & JOBCTL_STOP_SIGMASK)
+ task->jobctl &= ~JOBCTL_STOP_SIGMASK;
+
+ task->jobctl |= mask;
+ return true;
+}
+
+/**
+ * task_clear_jobctl_trapping - clear jobctl trapping bit
+ * @task: target task
+ *
+ * If JOBCTL_TRAPPING is set, a ptracer is waiting for us to enter TRACED.
+ * Clear it and wake up the ptracer. Note that we don't need any further
+ * locking. @task->siglock guarantees that @task->parent points to the
+ * ptracer.
+ *
+ * CONTEXT:
+ * Must be called with @task->sighand->siglock held.
+ */
+void task_clear_jobctl_trapping(struct task_struct *task)
+{
+ if (unlikely(task->jobctl & JOBCTL_TRAPPING)) {
+ task->jobctl &= ~JOBCTL_TRAPPING;
+ wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT);
}
}
/**
- * task_clear_group_stop_pending - clear pending group stop
+ * task_clear_jobctl_pending - clear jobctl pending bits
* @task: target task
+ * @mask: pending bits to clear
+ *
+ * Clear @mask from @task->jobctl. @mask must be subset of
+ * %JOBCTL_PENDING_MASK. If %JOBCTL_STOP_PENDING is being cleared, other
+ * STOP bits are cleared together.
*
- * Clear group stop states for @task.
+ * If clearing of @mask leaves no stop or trap pending, this function calls
+ * task_clear_jobctl_trapping().
*
* CONTEXT:
* Must be called with @task->sighand->siglock held.
*/
-void task_clear_group_stop_pending(struct task_struct *task)
+void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask)
{
- task->group_stop &= ~(GROUP_STOP_PENDING | GROUP_STOP_CONSUME |
- GROUP_STOP_DEQUEUED);
+ BUG_ON(mask & ~JOBCTL_PENDING_MASK);
+
+ if (mask & JOBCTL_STOP_PENDING)
+ mask |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED;
+
+ task->jobctl &= ~mask;
+
+ if (!(task->jobctl & JOBCTL_PENDING_MASK))
+ task_clear_jobctl_trapping(task);
}
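/*
 * Illustrative sketch, not part of the patch: the mask widening performed
 * by task_clear_jobctl_pending() - clearing STOP_PENDING drags the other
 * STOP bookkeeping bits along. Bit positions here are invented; the real
 * JOBCTL_* definitions live in the sched.h side of this series.
 */
#include <stdio.h>

#define STOP_PENDING    (1u << 0)       /* hypothetical bit layout */
#define STOP_CONSUME    (1u << 1)
#define STOP_DEQUEUED   (1u << 2)
#define TRAP_NOTIFY     (1u << 3)

int main(void)
{
        unsigned int jobctl = STOP_PENDING | STOP_CONSUME |
                              STOP_DEQUEUED | TRAP_NOTIFY;
        unsigned int mask = STOP_PENDING;

        if (mask & STOP_PENDING)                /* widen, as in the patch */
                mask |= STOP_CONSUME | STOP_DEQUEUED;
        jobctl &= ~mask;

        printf("jobctl = %#x\n", jobctl);       /* only TRAP_NOTIFY is left */
        return 0;
}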
/**
* task_participate_group_stop - participate in a group stop
* @task: task participating in a group stop
*
- * @task has GROUP_STOP_PENDING set and is participating in a group stop.
+ * @task has %JOBCTL_STOP_PENDING set and is participating in a group stop.
* Group stop states are cleared and the group stop count is consumed if
- * %GROUP_STOP_CONSUME was set. If the consumption completes the group
+ * %JOBCTL_STOP_CONSUME was set. If the consumption completes the group
* stop, the appropriate %SIGNAL_* flags are set.
*
* CONTEXT:
@@ -277,11 +321,11 @@ void task_clear_group_stop_pending(struct task_struct *task)
static bool task_participate_group_stop(struct task_struct *task)
{
struct signal_struct *sig = task->signal;
- bool consume = task->group_stop & GROUP_STOP_CONSUME;
+ bool consume = task->jobctl & JOBCTL_STOP_CONSUME;
- WARN_ON_ONCE(!(task->group_stop & GROUP_STOP_PENDING));
+ WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING));
- task_clear_group_stop_pending(task);
+ task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING);
if (!consume)
return false;
@@ -449,7 +493,8 @@ int unhandled_signal(struct task_struct *tsk, int sig)
return 1;
if (handler != SIG_IGN && handler != SIG_DFL)
return 0;
- return !tracehook_consider_fatal_signal(tsk, sig);
+ /* if ptraced, let the tracer determine */
+ return !tsk->ptrace;
}
/*
@@ -604,7 +649,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
* is to alert stop-signal processing code when another
* processor has come along and cleared the flag.
*/
- current->group_stop |= GROUP_STOP_DEQUEUED;
+ current->jobctl |= JOBCTL_STOP_DEQUEUED;
}
if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
/*
@@ -773,6 +818,32 @@ static int check_kill_permission(int sig, struct siginfo *info,
return security_task_kill(t, info, sig, 0);
}
+/**
+ * ptrace_trap_notify - schedule trap to notify ptracer
+ * @t: tracee wanting to notify tracer
+ *
+ * This function schedules a sticky ptrace trap, cleared on the next
+ * TRAP_STOP, to notify the ptracer of an event. @t must have been seized
+ * by the ptracer.
+ *
+ * If @t is running, a STOP trap will be taken. If trapped for STOP and
+ * the ptracer is listening for events, the tracee is woken up so that it
+ * can re-trap for the new event. If trapped otherwise, a STOP trap will
+ * eventually be taken without returning to userland after the existing
+ * traps are finished by PTRACE_CONT.
+ *
+ * CONTEXT:
+ * Must be called with @t->sighand->siglock held.
+ */
+static void ptrace_trap_notify(struct task_struct *t)
+{
+ WARN_ON_ONCE(!(t->ptrace & PT_SEIZED));
+ assert_spin_locked(&t->sighand->siglock);
+
+ task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
+ signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
+}
+
/*
* Handle magic process-wide effects of stop/continue signals. Unlike
* the signal actions, these happen immediately at signal-generation
@@ -809,9 +880,12 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
t = p;
do {
- task_clear_group_stop_pending(t);
+ task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
- wake_up_state(t, __TASK_STOPPED);
+ if (likely(!(t->ptrace & PT_SEIZED)))
+ wake_up_state(t, __TASK_STOPPED);
+ else
+ ptrace_trap_notify(t);
} while_each_thread(p, t);
/*
@@ -908,8 +982,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
if (sig_fatal(p, sig) &&
!(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
!sigismember(&t->real_blocked, sig) &&
- (sig == SIGKILL ||
- !tracehook_consider_fatal_signal(t, sig))) {
+ (sig == SIGKILL || !t->ptrace)) {
/*
* This signal will be fatal to the whole group.
*/
@@ -925,7 +998,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
signal->group_stop_count = 0;
t = p;
do {
- task_clear_group_stop_pending(t);
+ task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
} while_each_thread(p, t);
@@ -1160,7 +1233,7 @@ int zap_other_threads(struct task_struct *p)
p->signal->group_stop_count = 0;
while_each_thread(p, t) {
- task_clear_group_stop_pending(t);
+ task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
count++;
/* Don't bother with already dead threads */
@@ -1178,18 +1251,25 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
{
struct sighand_struct *sighand;
- rcu_read_lock();
for (;;) {
+ local_irq_save(*flags);
+ rcu_read_lock();
sighand = rcu_dereference(tsk->sighand);
- if (unlikely(sighand == NULL))
+ if (unlikely(sighand == NULL)) {
+ rcu_read_unlock();
+ local_irq_restore(*flags);
break;
+ }
- spin_lock_irqsave(&sighand->siglock, *flags);
- if (likely(sighand == tsk->sighand))
+ spin_lock(&sighand->siglock);
+ if (likely(sighand == tsk->sighand)) {
+ rcu_read_unlock();
break;
- spin_unlock_irqrestore(&sighand->siglock, *flags);
+ }
+ spin_unlock(&sighand->siglock);
+ rcu_read_unlock();
+ local_irq_restore(*flags);
}
- rcu_read_unlock();
return sighand;
}
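/*
 * Illustrative sketch, not part of the patch: the "load pointer, lock it,
 * revalidate, retry" pattern that __lock_task_sighand() uses. This
 * userspace model uses a plain mutex and atomics; it cannot show the part
 * RCU provides in the kernel (keeping the object alive across the race
 * window). All names are invented.
 */
#include <pthread.h>
#include <stdio.h>

struct obj {
        pthread_mutex_t lock;
};

struct owner {
        struct obj *cur;        /* may be switched by other threads */
};

static struct obj *lock_current_obj(struct owner *o)
{
        struct obj *obj;

        for (;;) {
                obj = __atomic_load_n(&o->cur, __ATOMIC_ACQUIRE);
                if (!obj)
                        return NULL;              /* owner went away */
                pthread_mutex_lock(&obj->lock);
                if (obj == __atomic_load_n(&o->cur, __ATOMIC_ACQUIRE))
                        return obj;               /* still current */
                pthread_mutex_unlock(&obj->lock); /* lost a race: retry */
        }
}

int main(void)
{
        struct obj a = { .lock = PTHREAD_MUTEX_INITIALIZER };
        struct owner o = { .cur = &a };
        struct obj *held = lock_current_obj(&o);

        if (held) {
                printf("locked the current object\n");
                pthread_mutex_unlock(&held->lock);
        }
        return 0;
}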
@@ -1504,22 +1584,22 @@ ret:
* Let a parent know about the death of a child.
* For a stopped/continued status change, use do_notify_parent_cldstop instead.
*
- * Returns -1 if our parent ignored us and so we've switched to
- * self-reaping, or else @sig.
+ * Returns true if our parent ignored us and so we've switched to
+ * self-reaping.
*/
-int do_notify_parent(struct task_struct *tsk, int sig)
+bool do_notify_parent(struct task_struct *tsk, int sig)
{
struct siginfo info;
unsigned long flags;
struct sighand_struct *psig;
- int ret = sig;
+ bool autoreap = false;
BUG_ON(sig == -1);
/* do_notify_parent_cldstop should have been called instead. */
BUG_ON(task_is_stopped_or_traced(tsk));
- BUG_ON(!task_ptrace(tsk) &&
+ BUG_ON(!tsk->ptrace &&
(tsk->group_leader != tsk || !thread_group_empty(tsk)));
info.si_signo = sig;
@@ -1558,7 +1638,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
psig = tsk->parent->sighand;
spin_lock_irqsave(&psig->siglock, flags);
- if (!task_ptrace(tsk) && sig == SIGCHLD &&
+ if (!tsk->ptrace && sig == SIGCHLD &&
(psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
(psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
/*
@@ -1576,16 +1656,16 @@ int do_notify_parent(struct task_struct *tsk, int sig)
* is implementation-defined: we do (if you don't want
* it, just use SIG_IGN instead).
*/
- ret = tsk->exit_signal = -1;
+ autoreap = true;
if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
- sig = -1;
+ sig = 0;
}
- if (valid_signal(sig) && sig > 0)
+ if (valid_signal(sig) && sig)
__group_send_sig_info(sig, &info, tsk->parent);
__wake_up_parent(tsk, tsk->parent);
spin_unlock_irqrestore(&psig->siglock, flags);
- return ret;
+ return autoreap;
}
/**
@@ -1658,7 +1738,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
static inline int may_ptrace_stop(void)
{
- if (!likely(task_ptrace(current)))
+ if (!likely(current->ptrace))
return 0;
/*
* Are we in the middle of do_coredump?
@@ -1687,15 +1767,6 @@ static int sigkill_pending(struct task_struct *tsk)
}
/*
- * Test whether the target task of the usual cldstop notification - the
- * real_parent of @child - is in the same group as the ptracer.
- */
-static bool real_parent_is_ptracer(struct task_struct *child)
-{
- return same_thread_group(child->parent, child->real_parent);
-}
-
-/*
* This must be called with current->sighand->siglock held.
*
* This should be the path for all ptrace stops.
@@ -1732,31 +1803,34 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
}
/*
- * If @why is CLD_STOPPED, we're trapping to participate in a group
- * stop. Do the bookkeeping. Note that if SIGCONT was delievered
- * while siglock was released for the arch hook, PENDING could be
- * clear now. We act as if SIGCONT is received after TASK_TRACED
- * is entered - ignore it.
+ * We're committing to trapping. TRACED should be visible before
+ * TRAPPING is cleared; otherwise, the tracer might fail do_wait().
+ * Also, transition to TRACED and updates to ->jobctl should be
+ * atomic with respect to siglock and should be done after the arch
+ * hook as siglock is released and regrabbed across it.
*/
- if (why == CLD_STOPPED && (current->group_stop & GROUP_STOP_PENDING))
- gstop_done = task_participate_group_stop(current);
+ set_current_state(TASK_TRACED);
current->last_siginfo = info;
current->exit_code = exit_code;
/*
- * TRACED should be visible before TRAPPING is cleared; otherwise,
- * the tracer might fail do_wait().
+ * If @why is CLD_STOPPED, we're trapping to participate in a group
+ * stop. Do the bookkeeping. Note that if SIGCONT was delivered
+ * across siglock relocks since INTERRUPT was scheduled, PENDING
+ * could be clear now. We act as if SIGCONT is received after
+ * TASK_TRACED is entered - ignore it.
*/
- set_current_state(TASK_TRACED);
+ if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING))
+ gstop_done = task_participate_group_stop(current);
- /*
- * We're committing to trapping. Clearing GROUP_STOP_TRAPPING and
- * transition to TASK_TRACED should be atomic with respect to
- * siglock. This hsould be done after the arch hook as siglock is
- * released and regrabbed across it.
- */
- task_clear_group_stop_trapping(current);
+ /* any trap clears pending STOP trap, STOP trap clears NOTIFY */
+ task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP);
+ if (info && info->si_code >> 8 == PTRACE_EVENT_STOP)
+ task_clear_jobctl_pending(current, JOBCTL_TRAP_NOTIFY);
+
+ /* entering a trap, clear TRAPPING */
+ task_clear_jobctl_trapping(current);
spin_unlock_irq(&current->sighand->siglock);
read_lock(&tasklist_lock);
@@ -1772,7 +1846,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
* separately unless they're gonna be duplicates.
*/
do_notify_parent_cldstop(current, true, why);
- if (gstop_done && !real_parent_is_ptracer(current))
+ if (gstop_done && ptrace_reparented(current))
do_notify_parent_cldstop(current, false, why);
/*
@@ -1792,9 +1866,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
*
* If @gstop_done, the ptracer went away between group stop
* completion and here. During detach, it would have set
- * GROUP_STOP_PENDING on us and we'll re-enter TASK_STOPPED
- * in do_signal_stop() on return, so notifying the real
- * parent of the group stop completion is enough.
+ * JOBCTL_STOP_PENDING on us and we'll re-enter
+ * TASK_STOPPED in do_signal_stop() on return, so notifying
+ * the real parent of the group stop completion is enough.
*/
if (gstop_done)
do_notify_parent_cldstop(current, false, why);
@@ -1820,6 +1894,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
spin_lock_irq(&current->sighand->siglock);
current->last_siginfo = NULL;
+ /* LISTENING can be set only during STOP traps, clear it */
+ current->jobctl &= ~JOBCTL_LISTENING;
+
/*
* Queued signals ignored us while we were stopped for tracing.
* So check for any that we should take before resuming user mode.
@@ -1828,44 +1905,66 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
recalc_sigpending_tsk(current);
}
-void ptrace_notify(int exit_code)
+static void ptrace_do_notify(int signr, int exit_code, int why)
{
siginfo_t info;
- BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
-
memset(&info, 0, sizeof info);
- info.si_signo = SIGTRAP;
+ info.si_signo = signr;
info.si_code = exit_code;
info.si_pid = task_pid_vnr(current);
info.si_uid = current_uid();
/* Let the debugger run. */
+ ptrace_stop(exit_code, why, 1, &info);
+}
+
+void ptrace_notify(int exit_code)
+{
+ BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
+
spin_lock_irq(&current->sighand->siglock);
- ptrace_stop(exit_code, CLD_TRAPPED, 1, &info);
+ ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
spin_unlock_irq(&current->sighand->siglock);
}
-/*
- * This performs the stopping for SIGSTOP and other stop signals.
- * We have to stop all threads in the thread group.
- * Returns non-zero if we've actually stopped and released the siglock.
- * Returns zero if we didn't stop and still hold the siglock.
+/**
+ * do_signal_stop - handle group stop for SIGSTOP and other stop signals
+ * @signr: signr causing group stop if initiating
+ *
+ * If %JOBCTL_STOP_PENDING is not set yet, initiate group stop with @signr
+ * and participate in it. If already set, participate in the existing
+ * group stop. If participated in a group stop (and thus slept), %true is
+ * returned with siglock released.
+ *
+ * If ptraced, this function doesn't handle stop itself. Instead,
+ * %JOBCTL_TRAP_STOP is scheduled and %false is returned with siglock
+ * untouched. The caller must ensure that INTERRUPT trap handling takes
+ * places afterwards.
+ *
+ * CONTEXT:
+ * Must be called with @current->sighand->siglock held, which is released
+ * on %true return.
+ *
+ * RETURNS:
+ * %false if group stop is already cancelled or ptrace trap is scheduled.
+ * %true if participated in group stop.
*/
-static int do_signal_stop(int signr)
+static bool do_signal_stop(int signr)
+ __releases(&current->sighand->siglock)
{
struct signal_struct *sig = current->signal;
- if (!(current->group_stop & GROUP_STOP_PENDING)) {
- unsigned int gstop = GROUP_STOP_PENDING | GROUP_STOP_CONSUME;
+ if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
+ unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
struct task_struct *t;
- /* signr will be recorded in task->group_stop for retries */
- WARN_ON_ONCE(signr & ~GROUP_STOP_SIGMASK);
+ /* signr will be recorded in task->jobctl for retries */
+ WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK);
- if (!likely(current->group_stop & GROUP_STOP_DEQUEUED) ||
+ if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) ||
unlikely(signal_group_exit(sig)))
- return 0;
+ return false;
/*
* There is no group stop already in progress. We must
* initiate one now.
@@ -1888,28 +1987,32 @@ static int do_signal_stop(int signr)
if (!(sig->flags & SIGNAL_STOP_STOPPED))
sig->group_exit_code = signr;
else
- WARN_ON_ONCE(!task_ptrace(current));
+ WARN_ON_ONCE(!current->ptrace);
+
+ sig->group_stop_count = 0;
+
+ if (task_set_jobctl_pending(current, signr | gstop))
+ sig->group_stop_count++;
- current->group_stop &= ~GROUP_STOP_SIGMASK;
- current->group_stop |= signr | gstop;
- sig->group_stop_count = 1;
for (t = next_thread(current); t != current;
t = next_thread(t)) {
- t->group_stop &= ~GROUP_STOP_SIGMASK;
/*
* Setting state to TASK_STOPPED for a group
* stop is always done with the siglock held,
* so this check has no races.
*/
- if (!(t->flags & PF_EXITING) && !task_is_stopped(t)) {
- t->group_stop |= signr | gstop;
+ if (!task_is_stopped(t) &&
+ task_set_jobctl_pending(t, signr | gstop)) {
sig->group_stop_count++;
- signal_wake_up(t, 0);
+ if (likely(!(t->ptrace & PT_SEIZED)))
+ signal_wake_up(t, 0);
+ else
+ ptrace_trap_notify(t);
}
}
}
-retry:
- if (likely(!task_ptrace(current))) {
+
+ if (likely(!current->ptrace)) {
int notify = 0;
/*
@@ -1940,43 +2043,65 @@ retry:
/* Now we don't run again until woken by SIGCONT or SIGKILL */
schedule();
-
- spin_lock_irq(&current->sighand->siglock);
+ return true;
} else {
- ptrace_stop(current->group_stop & GROUP_STOP_SIGMASK,
- CLD_STOPPED, 0, NULL);
- current->exit_code = 0;
+ /*
+ * While ptraced, group stop is handled by STOP trap.
+ * Schedule it and let the caller deal with it.
+ */
+ task_set_jobctl_pending(current, JOBCTL_TRAP_STOP);
+ return false;
}
+}
- /*
- * GROUP_STOP_PENDING could be set if another group stop has
- * started since being woken up or ptrace wants us to transit
- * between TASK_STOPPED and TRACED. Retry group stop.
- */
- if (current->group_stop & GROUP_STOP_PENDING) {
- WARN_ON_ONCE(!(current->group_stop & GROUP_STOP_SIGMASK));
- goto retry;
+/**
+ * do_jobctl_trap - take care of ptrace jobctl traps
+ *
+ * When PT_SEIZED, it's used for both group stop and explicit
+ * SEIZE/INTERRUPT traps. Both generate a PTRACE_EVENT_STOP trap with
+ * accompanying siginfo. If stopped, the lower eight bits of exit_code
+ * contain the stop signal; otherwise, %SIGTRAP.
+ *
+ * When !PT_SEIZED, it's used only for group stop trap with stop signal
+ * number as exit_code and no siginfo.
+ *
+ * CONTEXT:
+ * Must be called with @current->sighand->siglock held, which may be
+ * released and re-acquired before returning with intervening sleep.
+ */
+static void do_jobctl_trap(void)
+{
+ struct signal_struct *signal = current->signal;
+ int signr = current->jobctl & JOBCTL_STOP_SIGMASK;
+
+ if (current->ptrace & PT_SEIZED) {
+ if (!signal->group_stop_count &&
+ !(signal->flags & SIGNAL_STOP_STOPPED))
+ signr = SIGTRAP;
+ WARN_ON_ONCE(!signr);
+ ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8),
+ CLD_STOPPED);
+ } else {
+ WARN_ON_ONCE(!signr);
+ ptrace_stop(signr, CLD_STOPPED, 0, NULL);
+ current->exit_code = 0;
}
-
- /* PTRACE_ATTACH might have raced with task killing, clear trapping */
- task_clear_group_stop_trapping(current);
-
- spin_unlock_irq(&current->sighand->siglock);
-
- tracehook_finish_jctl();
-
- return 1;
}
static int ptrace_signal(int signr, siginfo_t *info,
struct pt_regs *regs, void *cookie)
{
- if (!task_ptrace(current))
- return signr;
-
ptrace_signal_deliver(regs, cookie);
-
- /* Let the debugger run. */
+ /*
+ * We do not check sig_kernel_stop(signr) but set this marker
+ * unconditionally because we do not know whether the debugger will
+ * change signr. This flag has no meaning unless we are going
+ * to stop after return from ptrace_stop(). In this case it will
+ * be checked in do_signal_stop(); we should only stop if it was
+ * not cleared by SIGCONT while we were sleeping. See also the
+ * comment in dequeue_signal().
+ */
+ current->jobctl |= JOBCTL_STOP_DEQUEUED;
ptrace_stop(signr, CLD_TRAPPED, 0, info);
/* We're back. Did the debugger cancel the sig? */
@@ -2032,7 +2157,6 @@ relock:
* the CLD_ si_code into SIGNAL_CLD_MASK bits.
*/
if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
- struct task_struct *leader;
int why;
if (signal->flags & SIGNAL_CLD_CONTINUED)
@@ -2053,13 +2177,11 @@ relock:
* a duplicate.
*/
read_lock(&tasklist_lock);
-
do_notify_parent_cldstop(current, false, why);
- leader = current->group_leader;
- if (task_ptrace(leader) && !real_parent_is_ptracer(leader))
- do_notify_parent_cldstop(leader, true, why);
-
+ if (ptrace_reparented(current->group_leader))
+ do_notify_parent_cldstop(current->group_leader,
+ true, why);
read_unlock(&tasklist_lock);
goto relock;
@@ -2067,37 +2189,31 @@ relock:
for (;;) {
struct k_sigaction *ka;
- /*
- * Tracing can induce an artificial signal and choose sigaction.
- * The return value in @signr determines the default action,
- * but @info->si_signo is the signal number we will report.
- */
- signr = tracehook_get_signal(current, regs, info, return_ka);
- if (unlikely(signr < 0))
+
+ if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) &&
+ do_signal_stop(0))
goto relock;
- if (unlikely(signr != 0))
- ka = return_ka;
- else {
- if (unlikely(current->group_stop &
- GROUP_STOP_PENDING) && do_signal_stop(0))
- goto relock;
- signr = dequeue_signal(current, &current->blocked,
- info);
+ if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) {
+ do_jobctl_trap();
+ spin_unlock_irq(&sighand->siglock);
+ goto relock;
+ }
- if (!signr)
- break; /* will return 0 */
+ signr = dequeue_signal(current, &current->blocked, info);
- if (signr != SIGKILL) {
- signr = ptrace_signal(signr, info,
- regs, cookie);
- if (!signr)
- continue;
- }
+ if (!signr)
+ break; /* will return 0 */
- ka = &sighand->action[signr-1];
+ if (unlikely(current->ptrace) && signr != SIGKILL) {
+ signr = ptrace_signal(signr, info,
+ regs, cookie);
+ if (!signr)
+ continue;
}
+ ka = &sighand->action[signr-1];
+
/* Trace actually delivered signals. */
trace_signal_deliver(signr, info, ka);
@@ -2253,7 +2369,7 @@ void exit_signals(struct task_struct *tsk)
signotset(&unblocked);
retarget_shared_pending(tsk, &unblocked);
- if (unlikely(tsk->group_stop & GROUP_STOP_PENDING) &&
+ if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) &&
task_participate_group_stop(tsk))
group_stop = CLD_STOPPED;
out:
@@ -2986,15 +3102,11 @@ SYSCALL_DEFINE0(sgetmask)
SYSCALL_DEFINE1(ssetmask, int, newmask)
{
- int old;
-
- spin_lock_irq(&current->sighand->siglock);
- old = current->blocked.sig[0];
+ int old = current->blocked.sig[0];
+ sigset_t newset;
- siginitset(&current->blocked, newmask & ~(sigmask(SIGKILL)|
- sigmask(SIGSTOP)));
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
+ siginitset(&newset, newmask & ~(sigmask(SIGKILL) | sigmask(SIGSTOP)));
+ set_current_blocked(&newset);
return old;
}
@@ -3051,11 +3163,8 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
return -EFAULT;
sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
- spin_lock_irq(&current->sighand->siglock);
current->saved_sigmask = current->blocked;
- current->blocked = newset;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
+ set_current_blocked(&newset);
current->state = TASK_INTERRUPTIBLE;
schedule();
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 40cf63ddd4b3..fca82c32042b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -315,16 +315,24 @@ static inline void invoke_softirq(void)
{
if (!force_irqthreads)
__do_softirq();
- else
+ else {
+ __local_bh_disable((unsigned long)__builtin_return_address(0),
+ SOFTIRQ_OFFSET);
wakeup_softirqd();
+ __local_bh_enable(SOFTIRQ_OFFSET);
+ }
}
#else
static inline void invoke_softirq(void)
{
if (!force_irqthreads)
do_softirq();
- else
+ else {
+ __local_bh_disable((unsigned long)__builtin_return_address(0),
+ SOFTIRQ_OFFSET);
wakeup_softirqd();
+ __local_bh_enable(SOFTIRQ_OFFSET);
+ }
}
#endif
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index e3516b29076c..ba5070ce5765 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -19,7 +19,7 @@
#include <linux/interrupt.h>
#include <linux/kallsyms.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
/*
* Structure to determine completion condition and record errors. May
@@ -136,10 +136,11 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
static DEFINE_MUTEX(stop_cpus_mutex);
static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
-int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
+static void queue_stop_cpus_work(const struct cpumask *cpumask,
+ cpu_stop_fn_t fn, void *arg,
+ struct cpu_stop_done *done)
{
struct cpu_stop_work *work;
- struct cpu_stop_done done;
unsigned int cpu;
/* initialize works and done */
@@ -147,9 +148,8 @@ int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
work = &per_cpu(stop_cpus_work, cpu);
work->fn = fn;
work->arg = arg;
- work->done = &done;
+ work->done = done;
}
- cpu_stop_init_done(&done, cpumask_weight(cpumask));
/*
* Disable preemption while queueing to avoid getting
@@ -161,7 +161,15 @@ int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
&per_cpu(stop_cpus_work, cpu));
preempt_enable();
+}
+static int __stop_cpus(const struct cpumask *cpumask,
+ cpu_stop_fn_t fn, void *arg)
+{
+ struct cpu_stop_done done;
+
+ cpu_stop_init_done(&done, cpumask_weight(cpumask));
+ queue_stop_cpus_work(cpumask, fn, arg, &done);
wait_for_completion(&done.completion);
return done.executed ? done.ret : -ENOENT;
}
@@ -431,8 +439,15 @@ static int stop_machine_cpu_stop(void *data)
struct stop_machine_data *smdata = data;
enum stopmachine_state curstate = STOPMACHINE_NONE;
int cpu = smp_processor_id(), err = 0;
+ unsigned long flags;
bool is_active;
+ /*
+ * When called from stop_machine_from_inactive_cpu(), irq might
+ * already be disabled. Save the state and restore it on exit.
+ */
+ local_save_flags(flags);
+
if (!smdata->active_cpus)
is_active = cpu == cpumask_first(cpu_online_mask);
else
@@ -460,7 +475,7 @@ static int stop_machine_cpu_stop(void *data)
}
} while (curstate != STOPMACHINE_EXIT);
- local_irq_enable();
+ local_irq_restore(flags);
return err;
}
@@ -487,4 +502,57 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
}
EXPORT_SYMBOL_GPL(stop_machine);
+/**
+ * stop_machine_from_inactive_cpu - stop_machine() from inactive CPU
+ * @fn: the function to run
+ * @data: the data ptr for the @fn()
+ * @cpus: the cpus to run the @fn() on (NULL = any online cpu)
+ *
+ * This is identical to stop_machine() but can be called from a CPU which
+ * is not active. The local CPU is in the middle of a hotplug operation
+ * (so no other CPU hotplug operation can start), is not marked active,
+ * and doesn't have enough context to sleep.
+ *
+ * This function provides stop_machine() functionality for such state by
+ * using busy-wait for synchronization and executing @fn directly for local
+ * CPU.
+ *
+ * CONTEXT:
+ * Local CPU is inactive. Temporarily stops all active CPUs.
+ *
+ * RETURNS:
+ * 0 if all executions of @fn returned 0, any non-zero return value if any
+ * returned non-zero.
+ */
+int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
+ const struct cpumask *cpus)
+{
+ struct stop_machine_data smdata = { .fn = fn, .data = data,
+ .active_cpus = cpus };
+ struct cpu_stop_done done;
+ int ret;
+
+ /* Local CPU must be inactive and CPU hotplug in progress. */
+ BUG_ON(cpu_active(raw_smp_processor_id()));
+ smdata.num_threads = num_active_cpus() + 1; /* +1 for local */
+
+ /* No proper task established and can't sleep - busy wait for lock. */
+ while (!mutex_trylock(&stop_cpus_mutex))
+ cpu_relax();
+
+ /* Schedule work on other CPUs and execute directly for local CPU */
+ set_state(&smdata, STOPMACHINE_PREPARE);
+ cpu_stop_init_done(&done, num_active_cpus());
+ queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata,
+ &done);
+ ret = stop_machine_cpu_stop(&smdata);
+
+ /* Busy wait for completion. */
+ while (!completion_done(&done.completion))
+ cpu_relax();
+
+ mutex_unlock(&stop_cpus_mutex);
+ return ret ?: done.ret;
+}
+
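/*
 * Illustrative sketch, not part of the patch: how a bring-up path running
 * on a not-yet-active CPU might use the new helper. The callback and its
 * purpose are hypothetical, and the declaration is assumed to land in
 * <linux/stop_machine.h> alongside stop_machine().
 */
#include <linux/stop_machine.h>

static int sync_hw_state(void *unused)
{
        /* Runs on every active CPU plus the inactive local one, all of
         * them rendezvousing by busy-waiting with IRQs disabled. */
        return 0;
}

static void cpu_bringup_sync(void)
{
        /* NULL cpumask: no specific set of CPUs is singled out */
        stop_machine_from_inactive_cpu(sync_hw_state, NULL, NULL);
}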
#endif /* CONFIG_STOP_MACHINE */
diff --git a/kernel/sys.c b/kernel/sys.c
index e4128b278f23..a101ba36c444 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -8,7 +8,6 @@
#include <linux/mm.h>
#include <linux/utsname.h>
#include <linux/mman.h>
-#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/prctl.h>
#include <linux/highuid.h>
@@ -320,6 +319,37 @@ void kernel_restart_prepare(char *cmd)
}
/**
+ * register_reboot_notifier - Register function to be called at reboot time
+ * @nb: Info about notifier function to be called
+ *
+ * Registers a function with the list of functions
+ * to be called at reboot time.
+ *
+ * Currently always returns zero, as blocking_notifier_chain_register()
+ * always returns zero.
+ */
+int register_reboot_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&reboot_notifier_list, nb);
+}
+EXPORT_SYMBOL(register_reboot_notifier);
+
+/**
+ * unregister_reboot_notifier - Unregister previously registered reboot notifier
+ * @nb: Hook to be unregistered
+ *
+ * Unregisters a previously registered reboot
+ * notifier function.
+ *
+ * Returns zero on success, or %-ENOENT on failure.
+ */
+int unregister_reboot_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
+}
+EXPORT_SYMBOL(unregister_reboot_notifier);
+
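/*
 * Illustrative sketch, not part of the patch: a minimal user of the
 * notifier API moved above. The callback name and message are invented;
 * register_reboot_notifier()/unregister_reboot_notifier() are the
 * functions exported here, declared in <linux/reboot.h>.
 */
#include <linux/kernel.h>
#include <linux/notifier.h>
#include <linux/reboot.h>

static int my_reboot_cb(struct notifier_block *nb, unsigned long action,
                        void *data)
{
        printk(KERN_INFO "reboot notifier: action %lu\n", action);
        return NOTIFY_DONE;
}

static struct notifier_block my_reboot_nb = {
        .notifier_call = my_reboot_cb,
};

/* module init: register_reboot_notifier(&my_reboot_nb);
 * module exit: unregister_reboot_notifier(&my_reboot_nb); */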
+/**
* kernel_restart - reboot the system
* @cmd: pointer to buffer containing command to execute for restart
* or %NULL
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f175d98bd355..11d65b531e50 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1590,16 +1590,11 @@ void sysctl_head_get(struct ctl_table_header *head)
spin_unlock(&sysctl_lock);
}
-static void free_head(struct rcu_head *rcu)
-{
- kfree(container_of(rcu, struct ctl_table_header, rcu));
-}
-
void sysctl_head_put(struct ctl_table_header *head)
{
spin_lock(&sysctl_lock);
if (!--head->count)
- call_rcu(&head->rcu, free_head);
+ kfree_rcu(head, rcu);
spin_unlock(&sysctl_lock);
}
@@ -1971,10 +1966,10 @@ void unregister_sysctl_table(struct ctl_table_header * header)
start_unregistering(header);
if (!--header->parent->count) {
WARN_ON(1);
- call_rcu(&header->parent->rcu, free_head);
+ kfree_rcu(header->parent, rcu);
}
if (!--header->count)
- call_rcu(&header->rcu, free_head);
+ kfree_rcu(header, rcu);
spin_unlock(&sysctl_lock);
}
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index fc0f22005417..e19ce1454ee1 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -28,7 +28,7 @@
#include <linux/fs.h>
#include <linux/file.h>
#include <net/genetlink.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
/*
* Maximum length of a cpumask that can be specified in
@@ -291,30 +291,28 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
if (!cpumask_subset(mask, cpu_possible_mask))
return -EINVAL;
- s = NULL;
if (isadd == REGISTER) {
for_each_cpu(cpu, mask) {
- if (!s)
- s = kmalloc_node(sizeof(struct listener),
- GFP_KERNEL, cpu_to_node(cpu));
+ s = kmalloc_node(sizeof(struct listener),
+ GFP_KERNEL, cpu_to_node(cpu));
if (!s)
goto cleanup;
+
s->pid = pid;
- INIT_LIST_HEAD(&s->list);
s->valid = 1;
listeners = &per_cpu(listener_array, cpu);
down_write(&listeners->sem);
- list_for_each_entry_safe(s2, tmp, &listeners->list, list) {
- if (s2->pid == pid)
- goto next_cpu;
+ list_for_each_entry(s2, &listeners->list, list) {
+ if (s2->pid == pid && s2->valid)
+ goto exists;
}
list_add(&s->list, &listeners->list);
s = NULL;
-next_cpu:
+exists:
up_write(&listeners->sem);
+ kfree(s); /* no-op if NULL */
}
- kfree(s);
return 0;
}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 342408cf68dd..2b021b0e8507 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -604,6 +604,12 @@ static struct timespec timekeeping_suspend_time;
*/
static void __timekeeping_inject_sleeptime(struct timespec *delta)
{
+ if (!timespec_valid(delta)) {
+ printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid "
+ "sleep delta value!\n");
+ return;
+ }
+
xtime = timespec_add(xtime, *delta);
wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta);
total_sleep_time = timespec_add(total_sleep_time, *delta);
@@ -686,12 +692,34 @@ static void timekeeping_resume(void)
static int timekeeping_suspend(void)
{
unsigned long flags;
+ struct timespec delta, delta_delta;
+ static struct timespec old_delta;
read_persistent_clock(&timekeeping_suspend_time);
write_seqlock_irqsave(&xtime_lock, flags);
timekeeping_forward_now();
timekeeping_suspended = 1;
+
+ /*
+ * To avoid drift caused by repeated suspend/resumes, each of
+ * which can add ~1 second of drift error, try to compensate
+ * so the difference between system time and persistent_clock
+ * time stays close to constant.
+ */
+ delta = timespec_sub(xtime, timekeeping_suspend_time);
+ delta_delta = timespec_sub(delta, old_delta);
+ if (abs(delta_delta.tv_sec) >= 2) {
+ /*
+ * If delta_delta is too large, assume a time correction
+ * has occurred and set old_delta to the current delta.
+ */
+ old_delta = delta;
+ } else {
+ /* Otherwise, adjust timekeeping_suspend_time to compensate */
+ timekeeping_suspend_time =
+ timespec_add(timekeeping_suspend_time, delta_delta);
+ }
write_sequnlock_irqrestore(&xtime_lock, flags);
clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
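/*
 * Illustrative sketch, not part of the patch: userspace arithmetic showing
 * why comparing successive deltas keeps repeated suspends from drifting.
 * The numbers are invented; the real code does the same thing with
 * timespec values and the 2-second threshold seen above.
 */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        /* xtime minus persistent clock, sampled at each suspend: the
         * sub-second read error keeps showing up as roughly the same
         * whole-second delta */
        long deltas[] = { 100, 100, 101, 100 };
        long old_delta = 0;

        for (int i = 0; i < 4; i++) {
                long delta_delta = deltas[i] - old_delta;

                if (labs(delta_delta) >= 2)
                        old_delta = deltas[i];  /* clock was stepped */
                else
                        printf("compensate suspend time by %lds\n",
                               delta_delta);
        }
        return 0;
}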
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 2ad39e556cb4..cd3134510f3d 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -82,7 +82,7 @@ config EVENT_POWER_TRACING_DEPRECATED
power:power_frequency
This is for userspace compatibility
and will vanish after 5 kernel iterations,
- namely 2.6.41.
+ namely 3.1.
config CONTEXT_SWITCH_TRACER
bool
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3f381d0b20a8..616846bcfee5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -2,7 +2,7 @@
#define _LINUX_KERNEL_TRACE_H
#include <linux/fs.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <linux/sched.h>
#include <linux/clocksource.h>
#include <linux/ring_buffer.h>
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 017fa376505d..fd3c8aae55e5 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -12,7 +12,7 @@
#include <linux/slab.h>
#include <linux/time.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include "trace.h"
#include "trace_output.h"
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0400553f0d04..25fb1b0e53fa 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -221,7 +221,7 @@ typedef unsigned long mayday_mask_t;
* per-CPU workqueues:
*/
struct workqueue_struct {
- unsigned int flags; /* I: WQ_* flags */
+ unsigned int flags; /* W: WQ_* flags */
union {
struct cpu_workqueue_struct __percpu *pcpu;
struct cpu_workqueue_struct *single;
@@ -240,6 +240,7 @@ struct workqueue_struct {
mayday_mask_t mayday_mask; /* cpus requesting rescue */
struct worker *rescuer; /* I: rescue worker */
+ int nr_drainers; /* W: drain in progress */
int saved_max_active; /* W: saved cwq max_active */
const char *name; /* I: workqueue name */
#ifdef CONFIG_LOCKDEP
@@ -990,7 +991,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
debug_work_activate(work);
/* if dying, only works from the same workqueue are allowed */
- if (unlikely(wq->flags & WQ_DYING) &&
+ if (unlikely(wq->flags & WQ_DRAINING) &&
WARN_ON_ONCE(!is_chained_work(wq)))
return;
@@ -2381,6 +2382,54 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(flush_workqueue);
+/**
+ * drain_workqueue - drain a workqueue
+ * @wq: workqueue to drain
+ *
+ * Wait until the workqueue becomes empty. While draining is in progress,
+ * only chain queueing is allowed. IOW, only currently pending or running
+ * work items on @wq can queue further work items on it. @wq is flushed
+ * repeatedly until it becomes empty. The number of flushes is determined
+ * by the depth of chaining and should be relatively small. Whine if it
+ * takes too long.
+ */
+void drain_workqueue(struct workqueue_struct *wq)
+{
+ unsigned int flush_cnt = 0;
+ unsigned int cpu;
+
+ /*
+ * __queue_work() needs to test whether there are drainers; it is much
+ * hotter than drain_workqueue() and already looks at @wq->flags.
+ * Use WQ_DRAINING so that __queue_work() doesn't have to check nr_drainers.
+ */
+ spin_lock(&workqueue_lock);
+ if (!wq->nr_drainers++)
+ wq->flags |= WQ_DRAINING;
+ spin_unlock(&workqueue_lock);
+reflush:
+ flush_workqueue(wq);
+
+ for_each_cwq_cpu(cpu, wq) {
+ struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+
+ if (!cwq->nr_active && list_empty(&cwq->delayed_works))
+ continue;
+
+ if (++flush_cnt == 10 ||
+ (flush_cnt % 100 == 0 && flush_cnt <= 1000))
+ pr_warning("workqueue %s: drain isn't complete after %u tries\n",
+ wq->name, flush_cnt);
+ goto reflush;
+ }
+
+ spin_lock(&workqueue_lock);
+ if (!--wq->nr_drainers)
+ wq->flags &= ~WQ_DRAINING;
+ spin_unlock(&workqueue_lock);
+}
+EXPORT_SYMBOL_GPL(drain_workqueue);
+
static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
bool wait_executing)
{
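The nr_drainers/WQ_DRAINING pair is a small "refcount guards a flag" idiom: the first drainer raises the flag that the hot enqueue path already checks, and the last one clears it, so nested or concurrent drains compose. A minimal pthreads sketch of that idiom, under invented names (drain_begin, drain_end, may_queue), not the workqueue internals:

/*
 * Hot path tests a single flag it already reads; cold path maintains a
 * counter under a lock so multiple drainers compose correctly.
 */
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int nr_drainers;		/* cold-path bookkeeping */
static bool draining;		/* the one flag the hot path checks */

static void drain_begin(void)
{
	pthread_mutex_lock(&lock);
	if (!nr_drainers++)	/* first drainer raises the flag */
		draining = true;
	pthread_mutex_unlock(&lock);
}

static void drain_end(void)
{
	pthread_mutex_lock(&lock);
	if (!--nr_drainers)	/* last drainer clears it */
		draining = false;
	pthread_mutex_unlock(&lock);
}

/* Enqueue is refused (unless chained) while a drain is in progress. */
static bool may_queue(bool is_chained_work)
{
	return !draining || is_chained_work;
}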
@@ -3009,34 +3058,10 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
*/
void destroy_workqueue(struct workqueue_struct *wq)
{
- unsigned int flush_cnt = 0;
unsigned int cpu;
- /*
- * Mark @wq dying and drain all pending works. Once WQ_DYING is
- * set, only chain queueing is allowed. IOW, only currently
- * pending or running work items on @wq can queue further work
- * items on it. @wq is flushed repeatedly until it becomes empty.
- * The number of flushing is detemined by the depth of chaining and
- * should be relatively short. Whine if it takes too long.
- */
- wq->flags |= WQ_DYING;
-reflush:
- flush_workqueue(wq);
-
- for_each_cwq_cpu(cpu, wq) {
- struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
-
- if (!cwq->nr_active && list_empty(&cwq->delayed_works))
- continue;
-
- if (++flush_cnt == 10 ||
- (flush_cnt % 100 == 0 && flush_cnt <= 1000))
- printk(KERN_WARNING "workqueue %s: flush on "
- "destruction isn't complete after %u tries\n",
- wq->name, flush_cnt);
- goto reflush;
- }
+ /* drain it before proceeding with destruction */
+ drain_workqueue(wq);
/*
* wq list is used to freeze wq, remove from list after