Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 104
-rw-r--r--  kernel/audit.c | 2
-rw-r--r--  kernel/audit.h | 18
-rw-r--r--  kernel/audit_fsnotify.c | 216
-rw-r--r--  kernel/audit_tree.c | 2
-rw-r--r--  kernel/audit_watch.c | 56
-rw-r--r--  kernel/auditfilter.c | 83
-rw-r--r--  kernel/auditsc.c | 9
-rw-r--r--  kernel/bpf/arraymap.c | 137
-rw-r--r--  kernel/bpf/core.c | 9
-rw-r--r--  kernel/bpf/syscall.c | 14
-rw-r--r--  kernel/bpf/verifier.c | 58
-rw-r--r--  kernel/cgroup.c | 127
-rw-r--r--  kernel/cgroup_freezer.c | 2
-rw-r--r--  kernel/cgroup_pids.c | 355
-rw-r--r--  kernel/cred.c | 13
-rw-r--r--  kernel/events/core.c | 80
-rw-r--r--  kernel/extable.c | 1
-rw-r--r--  kernel/fork.c | 20
-rw-r--r--  kernel/futex.c | 100
-rw-r--r--  kernel/jump_label.c | 158
-rw-r--r--  kernel/kexec.c | 2531
-rw-r--r--  kernel/kexec_core.c | 1534
-rw-r--r--  kernel/kexec_file.c | 1045
-rw-r--r--  kernel/kexec_internal.h | 22
-rw-r--r--  kernel/kmod.c | 100
-rw-r--r--  kernel/ksysfs.c | 6
-rw-r--r--  kernel/kthread.c | 7
-rw-r--r--  kernel/locking/Makefile | 4
-rw-r--r--  kernel/locking/percpu-rwsem.c | 13
-rw-r--r--  kernel/locking/qrwlock.c | 47
-rw-r--r--  kernel/locking/qspinlock.c | 6
-rw-r--r--  kernel/locking/qspinlock_paravirt.h | 102
-rw-r--r--  kernel/locking/rtmutex-tester.c | 420
-rw-r--r--  kernel/locking/rtmutex.c | 2
-rw-r--r--  kernel/locking/rtmutex_common.h | 22
-rw-r--r--  kernel/memremap.c | 190
-rw-r--r--  kernel/module_signing.c | 213
-rw-r--r--  kernel/power/swap.c | 12
-rw-r--r--  kernel/printk/printk.c | 2
-rw-r--r--  kernel/profile.c | 8
-rw-r--r--  kernel/ptrace.c | 13
-rw-r--r--  kernel/reboot.c | 2
-rw-r--r--  kernel/resource.c | 61
-rw-r--r--  kernel/sched/core.c | 8
-rw-r--r--  kernel/sched/wait.c | 7
-rw-r--r--  kernel/seccomp.c | 17
-rw-r--r--  kernel/smpboot.c | 27
-rw-r--r--  kernel/sys_ni.c | 1
-rw-r--r--  kernel/sysctl.c | 12
-rw-r--r--  kernel/system_certificates.S | 20
-rw-r--r--  kernel/system_keyring.c | 106
-rw-r--r--  kernel/task_work.c | 12
-rw-r--r--  kernel/trace/blktrace.c | 10
-rw-r--r--  kernel/trace/bpf_trace.c | 63
-rw-r--r--  kernel/trace/ftrace.c | 9
-rw-r--r--  kernel/trace/ring_buffer.c | 764
-rw-r--r--  kernel/trace/trace.c | 4
-rw-r--r--  kernel/trace/trace_events.c | 25
-rw-r--r--  kernel/trace/trace_events_filter.c | 54
-rw-r--r--  kernel/trace/trace_functions_graph.c | 4
-rw-r--r--  kernel/trace/trace_kprobe.c | 20
-rw-r--r--  kernel/trace/trace_output.c | 4
-rw-r--r--  kernel/trace/trace_stack.c | 68
-rw-r--r--  kernel/user_namespace.c | 1
-rw-r--r--  kernel/watchdog.c | 189
-rw-r--r--  kernel/workqueue.c | 2
67 files changed, 5076 insertions, 4277 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 43c4c920f30a..d4988410b410 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -45,16 +45,18 @@ ifneq ($(CONFIG_SMP),y)
obj-y += up.o
endif
obj-$(CONFIG_UID16) += uid16.o
-obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_MODULE_SIG) += module_signing.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
+obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
obj-$(CONFIG_KEXEC) += kexec.o
+obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup.o
obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
+obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
@@ -64,7 +66,7 @@ obj-$(CONFIG_SMP) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
-obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o
+obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o
obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
obj-$(CONFIG_GCOV_KERNEL) += gcov/
obj-$(CONFIG_KPROBES) += kprobes.o
@@ -99,6 +101,8 @@ obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
obj-$(CONFIG_TORTURE_TEST) += torture.o
+obj-$(CONFIG_HAS_IOMEM) += memremap.o
+
$(obj)/configs.o: $(obj)/config_data.h
# config_data.h contains the same information as ikconfig.h but gzipped.
@@ -111,99 +115,3 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
targets += config_data.h
$(obj)/config_data.h: $(obj)/config_data.gz FORCE
$(call filechk,ikconfiggz)
-
-###############################################################################
-#
-# Roll all the X.509 certificates that we can find together and pull them into
-# the kernel so that they get loaded into the system trusted keyring during
-# boot.
-#
-# We look in the source root and the build root for all files whose name ends
-# in ".x509". Unfortunately, this will generate duplicate filenames, so we
-# have make canonicalise the pathnames and then sort them to discard the
-# duplicates.
-#
-###############################################################################
-ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y)
-X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509)
-X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509
-X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \
- $(or $(realpath $(CERT)),$(CERT))))
-X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw))
-
-ifeq ($(X509_CERTIFICATES),)
-$(warning *** No X.509 certificates found ***)
-endif
-
-ifneq ($(wildcard $(obj)/.x509.list),)
-ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES))
-$(warning X.509 certificate list changed to "$(X509_CERTIFICATES)" from "$(shell cat $(obj)/.x509.list)")
-$(shell rm $(obj)/.x509.list)
-endif
-endif
-
-kernel/system_certificates.o: $(obj)/x509_certificate_list
-
-quiet_cmd_x509certs = CERTS $@
- cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; $(kecho) " - Including cert $(X509)")
-
-targets += $(obj)/x509_certificate_list
-$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
- $(call if_changed,x509certs)
-
-targets += $(obj)/.x509.list
-$(obj)/.x509.list:
- @echo $(X509_CERTIFICATES) >$@
-endif
-
-clean-files := x509_certificate_list .x509.list
-
-ifeq ($(CONFIG_MODULE_SIG),y)
-###############################################################################
-#
-# If module signing is requested, say by allyesconfig, but a key has not been
-# supplied, then one will need to be generated to make sure the build does not
-# fail and that the kernel may be used afterwards.
-#
-###############################################################################
-ifndef CONFIG_MODULE_SIG_HASH
-$(error Could not determine digest type to use from kernel config)
-endif
-
-signing_key.priv signing_key.x509: x509.genkey
- @echo "###"
- @echo "### Now generating an X.509 key pair to be used for signing modules."
- @echo "###"
- @echo "### If this takes a long time, you might wish to run rngd in the"
- @echo "### background to keep the supply of entropy topped up. It"
- @echo "### needs to be run as root, and uses a hardware random"
- @echo "### number generator if one is available."
- @echo "###"
- openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
- -batch -x509 -config x509.genkey \
- -outform DER -out signing_key.x509 \
- -keyout signing_key.priv 2>&1
- @echo "###"
- @echo "### Key pair generated."
- @echo "###"
-
-x509.genkey:
- @echo Generating X.509 key generation config
- @echo >x509.genkey "[ req ]"
- @echo >>x509.genkey "default_bits = 4096"
- @echo >>x509.genkey "distinguished_name = req_distinguished_name"
- @echo >>x509.genkey "prompt = no"
- @echo >>x509.genkey "string_mask = utf8only"
- @echo >>x509.genkey "x509_extensions = myexts"
- @echo >>x509.genkey
- @echo >>x509.genkey "[ req_distinguished_name ]"
- @echo >>x509.genkey "#O = Unspecified company"
- @echo >>x509.genkey "CN = Build time autogenerated kernel key"
- @echo >>x509.genkey "#emailAddress = unspecified.user@unspecified.company"
- @echo >>x509.genkey
- @echo >>x509.genkey "[ myexts ]"
- @echo >>x509.genkey "basicConstraints=critical,CA:FALSE"
- @echo >>x509.genkey "keyUsage=digitalSignature"
- @echo >>x509.genkey "subjectKeyIdentifier=hash"
- @echo >>x509.genkey "authorityKeyIdentifier=keyid"
-endif
diff --git a/kernel/audit.c b/kernel/audit.c
index f9e6065346db..662c007635fb 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1761,7 +1761,7 @@ void audit_log_name(struct audit_context *context, struct audit_names *n,
} else
audit_log_format(ab, " name=(null)");
- if (n->ino != (unsigned long)-1)
+ if (n->ino != AUDIT_INO_UNSET)
audit_log_format(ab, " inode=%lu"
" dev=%02x:%02x mode=%#ho"
" ouid=%u ogid=%u rdev=%02x:%02x",
diff --git a/kernel/audit.h b/kernel/audit.h
index d641f9bb3ed0..dadf86a0e59e 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -50,6 +50,7 @@ enum audit_state {
/* Rule lists */
struct audit_watch;
+struct audit_fsnotify_mark;
struct audit_tree;
struct audit_chunk;
@@ -252,6 +253,7 @@ struct audit_net {
extern int selinux_audit_rule_update(void);
extern struct mutex audit_filter_mutex;
+extern int audit_del_rule(struct audit_entry *);
extern void audit_free_rule_rcu(struct rcu_head *);
extern struct list_head audit_filter_list[];
@@ -269,6 +271,15 @@ extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
extern void audit_remove_watch_rule(struct audit_krule *krule);
extern char *audit_watch_path(struct audit_watch *watch);
extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev);
+
+extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len);
+extern char *audit_mark_path(struct audit_fsnotify_mark *mark);
+extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark);
+extern void audit_remove_mark_rule(struct audit_krule *krule);
+extern int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev);
+extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old);
+extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark);
+
#else
#define audit_put_watch(w) {}
#define audit_get_watch(w) {}
@@ -278,6 +289,13 @@ extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev
#define audit_watch_path(w) ""
#define audit_watch_compare(w, i, d) 0
+#define audit_alloc_mark(k, p, l) (ERR_PTR(-EINVAL))
+#define audit_mark_path(m) ""
+#define audit_remove_mark(m)
+#define audit_remove_mark_rule(k)
+#define audit_mark_compare(m, i, d) 0
+#define audit_exe_compare(t, m) (-EINVAL)
+#define audit_dupe_exe(n, o) (-EINVAL)
#endif /* CONFIG_AUDIT_WATCH */
#ifdef CONFIG_AUDIT_TREE
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
new file mode 100644
index 000000000000..27c6046c2c3d
--- /dev/null
+++ b/kernel/audit_fsnotify.c
@@ -0,0 +1,216 @@
+/* audit_fsnotify.c -- tracking inodes
+ *
+ * Copyright 2003-2009,2014-2015 Red Hat, Inc.
+ * Copyright 2005 Hewlett-Packard Development Company, L.P.
+ * Copyright 2005 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/audit.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/fsnotify_backend.h>
+#include <linux/namei.h>
+#include <linux/netlink.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/security.h>
+#include "audit.h"
+
+/*
+ * This mark lives on the parent directory of the inode in question,
+ * but dev, ino, and path are about the child.
+ */
+struct audit_fsnotify_mark {
+ dev_t dev; /* associated superblock device */
+ unsigned long ino; /* associated inode number */
+ char *path; /* insertion path */
+ struct fsnotify_mark mark; /* fsnotify mark on the inode */
+ struct audit_krule *rule;
+};
+
+/* fsnotify handle. */
+static struct fsnotify_group *audit_fsnotify_group;
+
+/* fsnotify events we care about. */
+#define AUDIT_FS_EVENTS (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
+ FS_MOVE_SELF | FS_EVENT_ON_CHILD)
+
+static void audit_fsnotify_mark_free(struct audit_fsnotify_mark *audit_mark)
+{
+ kfree(audit_mark->path);
+ kfree(audit_mark);
+}
+
+static void audit_fsnotify_free_mark(struct fsnotify_mark *mark)
+{
+ struct audit_fsnotify_mark *audit_mark;
+
+ audit_mark = container_of(mark, struct audit_fsnotify_mark, mark);
+ audit_fsnotify_mark_free(audit_mark);
+}
+
+char *audit_mark_path(struct audit_fsnotify_mark *mark)
+{
+ return mark->path;
+}
+
+int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev)
+{
+ if (mark->ino == AUDIT_INO_UNSET)
+ return 0;
+ return (mark->ino == ino) && (mark->dev == dev);
+}
+
+static void audit_update_mark(struct audit_fsnotify_mark *audit_mark,
+ struct inode *inode)
+{
+ audit_mark->dev = inode ? inode->i_sb->s_dev : AUDIT_DEV_UNSET;
+ audit_mark->ino = inode ? inode->i_ino : AUDIT_INO_UNSET;
+}
+
+struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len)
+{
+ struct audit_fsnotify_mark *audit_mark;
+ struct path path;
+ struct dentry *dentry;
+ struct inode *inode;
+ int ret;
+
+ if (pathname[0] != '/' || pathname[len-1] == '/')
+ return ERR_PTR(-EINVAL);
+
+ dentry = kern_path_locked(pathname, &path);
+ if (IS_ERR(dentry))
+ return (void *)dentry; /* returning an error */
+ inode = path.dentry->d_inode;
+ mutex_unlock(&inode->i_mutex);
+
+ audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL);
+ if (unlikely(!audit_mark)) {
+ audit_mark = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_free_mark);
+ audit_mark->mark.mask = AUDIT_FS_EVENTS;
+ audit_mark->path = pathname;
+ audit_update_mark(audit_mark, dentry->d_inode);
+ audit_mark->rule = krule;
+
+ ret = fsnotify_add_mark(&audit_mark->mark, audit_fsnotify_group, inode, NULL, true);
+ if (ret < 0) {
+ audit_fsnotify_mark_free(audit_mark);
+ audit_mark = ERR_PTR(ret);
+ }
+out:
+ dput(dentry);
+ path_put(&path);
+ return audit_mark;
+}
+
+static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, char *op)
+{
+ struct audit_buffer *ab;
+ struct audit_krule *rule = audit_mark->rule;
+
+ if (!audit_enabled)
+ return;
+ ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
+ if (unlikely(!ab))
+ return;
+ audit_log_format(ab, "auid=%u ses=%u op=",
+ from_kuid(&init_user_ns, audit_get_loginuid(current)),
+ audit_get_sessionid(current));
+ audit_log_string(ab, op);
+ audit_log_format(ab, " path=");
+ audit_log_untrustedstring(ab, audit_mark->path);
+ audit_log_key(ab, rule->filterkey);
+ audit_log_format(ab, " list=%d res=1", rule->listnr);
+ audit_log_end(ab);
+}
+
+void audit_remove_mark(struct audit_fsnotify_mark *audit_mark)
+{
+ fsnotify_destroy_mark(&audit_mark->mark, audit_fsnotify_group);
+ fsnotify_put_mark(&audit_mark->mark);
+}
+
+void audit_remove_mark_rule(struct audit_krule *krule)
+{
+ struct audit_fsnotify_mark *mark = krule->exe;
+
+ audit_remove_mark(mark);
+}
+
+static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark)
+{
+ struct audit_krule *rule = audit_mark->rule;
+ struct audit_entry *entry = container_of(rule, struct audit_entry, rule);
+
+ audit_mark_log_rule_change(audit_mark, "autoremove_rule");
+ audit_del_rule(entry);
+}
+
+/* Update mark data in audit rules based on fsnotify events. */
+static int audit_mark_handle_event(struct fsnotify_group *group,
+ struct inode *to_tell,
+ struct fsnotify_mark *inode_mark,
+ struct fsnotify_mark *vfsmount_mark,
+ u32 mask, void *data, int data_type,
+ const unsigned char *dname, u32 cookie)
+{
+ struct audit_fsnotify_mark *audit_mark;
+ struct inode *inode = NULL;
+
+ audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark);
+
+ BUG_ON(group != audit_fsnotify_group);
+
+ switch (data_type) {
+ case (FSNOTIFY_EVENT_PATH):
+ inode = ((struct path *)data)->dentry->d_inode;
+ break;
+ case (FSNOTIFY_EVENT_INODE):
+ inode = (struct inode *)data;
+ break;
+ default:
+ BUG();
+ return 0;
+ };
+
+ if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) {
+ if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL))
+ return 0;
+ audit_update_mark(audit_mark, inode);
+ } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
+ audit_autoremove_mark_rule(audit_mark);
+
+ return 0;
+}
+
+static const struct fsnotify_ops audit_mark_fsnotify_ops = {
+ .handle_event = audit_mark_handle_event,
+};
+
+static int __init audit_fsnotify_init(void)
+{
+ audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops);
+ if (IS_ERR(audit_fsnotify_group)) {
+ audit_fsnotify_group = NULL;
+ audit_panic("cannot create audit fsnotify group");
+ }
+ return 0;
+}
+device_initcall(audit_fsnotify_init);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index b0f9877273fc..94ecdabda8e6 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -479,6 +479,8 @@ static void kill_rules(struct audit_tree *tree)
if (rule->tree) {
/* not a half-baked one */
audit_tree_log_remove_rule(rule);
+ if (entry->rule.exe)
+ audit_remove_mark(entry->rule.exe);
rule->tree = NULL;
list_del_rcu(&entry->list);
list_del(&entry->rule.list);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 6e30024d9aac..656c7e93ac0d 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -138,7 +138,7 @@ char *audit_watch_path(struct audit_watch *watch)
int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
{
- return (watch->ino != (unsigned long)-1) &&
+ return (watch->ino != AUDIT_INO_UNSET) &&
(watch->ino == ino) &&
(watch->dev == dev);
}
@@ -179,8 +179,8 @@ static struct audit_watch *audit_init_watch(char *path)
INIT_LIST_HEAD(&watch->rules);
atomic_set(&watch->count, 1);
watch->path = path;
- watch->dev = (dev_t)-1;
- watch->ino = (unsigned long)-1;
+ watch->dev = AUDIT_DEV_UNSET;
+ watch->ino = AUDIT_INO_UNSET;
return watch;
}
@@ -203,7 +203,6 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
if (IS_ERR(watch))
return PTR_ERR(watch);
- audit_get_watch(watch);
krule->watch = watch;
return 0;
@@ -313,6 +312,8 @@ static void audit_update_watch(struct audit_parent *parent,
list_replace(&oentry->rule.list,
&nentry->rule.list);
}
+ if (oentry->rule.exe)
+ audit_remove_mark(oentry->rule.exe);
audit_watch_log_rule_change(r, owatch, "updated_rules");
@@ -343,6 +344,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
e = container_of(r, struct audit_entry, rule);
audit_watch_log_rule_change(r, w, "remove_rule");
+ if (e->rule.exe)
+ audit_remove_mark(e->rule.exe);
list_del(&r->rlist);
list_del(&r->list);
list_del_rcu(&e->list);
@@ -387,19 +390,20 @@ static void audit_add_to_parent(struct audit_krule *krule,
watch_found = 1;
- /* put krule's and initial refs to temporary watch */
- audit_put_watch(watch);
+ /* put krule's ref to temporary watch */
audit_put_watch(watch);
audit_get_watch(w);
krule->watch = watch = w;
+
+ audit_put_parent(parent);
break;
}
if (!watch_found) {
- audit_get_parent(parent);
watch->parent = parent;
+ audit_get_watch(watch);
list_add(&watch->wlist, &parent->watches);
}
list_add(&krule->rlist, &watch->rules);
@@ -437,9 +441,6 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
audit_add_to_parent(krule, parent);
- /* match get in audit_find_parent or audit_init_parent */
- audit_put_parent(parent);
-
h = audit_hash_ino((u32)watch->ino);
*list = &audit_inode_hash[h];
error:
@@ -496,7 +497,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
else if (mask & (FS_DELETE|FS_MOVED_FROM))
- audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
+ audit_update_watch(parent, dname, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1);
else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
audit_remove_parent_watches(parent);
@@ -517,3 +518,36 @@ static int __init audit_watch_init(void)
return 0;
}
device_initcall(audit_watch_init);
+
+int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old)
+{
+ struct audit_fsnotify_mark *audit_mark;
+ char *pathname;
+
+ pathname = kstrdup(audit_mark_path(old->exe), GFP_KERNEL);
+ if (!pathname)
+ return -ENOMEM;
+
+ audit_mark = audit_alloc_mark(new, pathname, strlen(pathname));
+ if (IS_ERR(audit_mark)) {
+ kfree(pathname);
+ return PTR_ERR(audit_mark);
+ }
+ new->exe = audit_mark;
+
+ return 0;
+}
+
+int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
+{
+ struct file *exe_file;
+ unsigned long ino;
+ dev_t dev;
+
+ rcu_read_lock();
+ exe_file = rcu_dereference(tsk->mm->exe_file);
+ ino = exe_file->f_inode->i_ino;
+ dev = exe_file->f_inode->i_sb->s_dev;
+ rcu_read_unlock();
+ return audit_mark_compare(mark, ino, dev);
+}
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 72e1660a79a3..7714d93edb85 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -405,6 +405,12 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
if (f->val > AUDIT_MAX_FIELD_COMPARE)
return -EINVAL;
break;
+ case AUDIT_EXE:
+ if (f->op != Audit_equal)
+ return -EINVAL;
+ if (entry->rule.listnr != AUDIT_FILTER_EXIT)
+ return -EINVAL;
+ break;
};
return 0;
}
@@ -419,6 +425,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
size_t remain = datasz - sizeof(struct audit_rule_data);
int i;
char *str;
+ struct audit_fsnotify_mark *audit_mark;
entry = audit_to_entry_common(data);
if (IS_ERR(entry))
@@ -539,6 +546,24 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
entry->rule.buflen += f->val;
entry->rule.filterkey = str;
break;
+ case AUDIT_EXE:
+ if (entry->rule.exe || f->val > PATH_MAX)
+ goto exit_free;
+ str = audit_unpack_string(&bufp, &remain, f->val);
+ if (IS_ERR(str)) {
+ err = PTR_ERR(str);
+ goto exit_free;
+ }
+ entry->rule.buflen += f->val;
+
+ audit_mark = audit_alloc_mark(&entry->rule, str, f->val);
+ if (IS_ERR(audit_mark)) {
+ kfree(str);
+ err = PTR_ERR(audit_mark);
+ goto exit_free;
+ }
+ entry->rule.exe = audit_mark;
+ break;
}
}
@@ -549,10 +574,10 @@ exit_nofree:
return entry;
exit_free:
- if (entry->rule.watch)
- audit_put_watch(entry->rule.watch); /* matches initial get */
if (entry->rule.tree)
audit_put_tree(entry->rule.tree); /* that's the temporary one */
+ if (entry->rule.exe)
+ audit_remove_mark(entry->rule.exe); /* that's the template one */
audit_free_rule(entry);
return ERR_PTR(err);
}
@@ -617,6 +642,10 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
data->buflen += data->values[i] =
audit_pack_string(&bufp, krule->filterkey);
break;
+ case AUDIT_EXE:
+ data->buflen += data->values[i] =
+ audit_pack_string(&bufp, audit_mark_path(krule->exe));
+ break;
case AUDIT_LOGINUID_SET:
if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) {
data->fields[i] = AUDIT_LOGINUID;
@@ -680,6 +709,12 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
if (strcmp(a->filterkey, b->filterkey))
return 1;
break;
+ case AUDIT_EXE:
+ /* both paths exist based on above type compare */
+ if (strcmp(audit_mark_path(a->exe),
+ audit_mark_path(b->exe)))
+ return 1;
+ break;
case AUDIT_UID:
case AUDIT_EUID:
case AUDIT_SUID:
@@ -801,8 +836,14 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old)
err = -ENOMEM;
else
new->filterkey = fk;
+ break;
+ case AUDIT_EXE:
+ err = audit_dupe_exe(new, old);
+ break;
}
if (err) {
+ if (new->exe)
+ audit_remove_mark(new->exe);
audit_free_rule(entry);
return ERR_PTR(err);
}
@@ -863,7 +904,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
struct audit_watch *watch = entry->rule.watch;
struct audit_tree *tree = entry->rule.tree;
struct list_head *list;
- int err;
+ int err = 0;
#ifdef CONFIG_AUDITSYSCALL
int dont_count = 0;
@@ -881,7 +922,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
/* normally audit_add_tree_rule() will free it on failure */
if (tree)
audit_put_tree(tree);
- goto error;
+ return err;
}
if (watch) {
@@ -895,14 +936,14 @@ static inline int audit_add_rule(struct audit_entry *entry)
*/
if (tree)
audit_put_tree(tree);
- goto error;
+ return err;
}
}
if (tree) {
err = audit_add_tree_rule(&entry->rule);
if (err) {
mutex_unlock(&audit_filter_mutex);
- goto error;
+ return err;
}
}
@@ -933,19 +974,13 @@ static inline int audit_add_rule(struct audit_entry *entry)
#endif
mutex_unlock(&audit_filter_mutex);
- return 0;
-
-error:
- if (watch)
- audit_put_watch(watch); /* tmp watch, matches initial get */
return err;
}
/* Remove an existing rule from filterlist. */
-static inline int audit_del_rule(struct audit_entry *entry)
+int audit_del_rule(struct audit_entry *entry)
{
struct audit_entry *e;
- struct audit_watch *watch = entry->rule.watch;
struct audit_tree *tree = entry->rule.tree;
struct list_head *list;
int ret = 0;
@@ -961,7 +996,6 @@ static inline int audit_del_rule(struct audit_entry *entry)
mutex_lock(&audit_filter_mutex);
e = audit_find_rule(entry, &list);
if (!e) {
- mutex_unlock(&audit_filter_mutex);
ret = -ENOENT;
goto out;
}
@@ -972,9 +1006,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
if (e->rule.tree)
audit_remove_tree_rule(&e->rule);
- list_del_rcu(&e->list);
- list_del(&e->rule.list);
- call_rcu(&e->rcu, audit_free_rule_rcu);
+ if (e->rule.exe)
+ audit_remove_mark_rule(&e->rule);
#ifdef CONFIG_AUDITSYSCALL
if (!dont_count)
@@ -983,11 +1016,14 @@ static inline int audit_del_rule(struct audit_entry *entry)
if (!audit_match_signal(entry))
audit_signals--;
#endif
- mutex_unlock(&audit_filter_mutex);
+
+ list_del_rcu(&e->list);
+ list_del(&e->rule.list);
+ call_rcu(&e->rcu, audit_free_rule_rcu);
out:
- if (watch)
- audit_put_watch(watch); /* match initial get */
+ mutex_unlock(&audit_filter_mutex);
+
if (tree)
audit_put_tree(tree); /* that's the temporary one */
@@ -1077,8 +1113,11 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data,
WARN_ON(1);
}
- if (err || type == AUDIT_DEL_RULE)
+ if (err || type == AUDIT_DEL_RULE) {
+ if (entry->rule.exe)
+ audit_remove_mark(entry->rule.exe);
audit_free_rule(entry);
+ }
return err;
}
@@ -1370,6 +1409,8 @@ static int update_lsm_rule(struct audit_krule *r)
return 0;
nentry = audit_dupe_rule(r);
+ if (entry->rule.exe)
+ audit_remove_mark(entry->rule.exe);
if (IS_ERR(nentry)) {
/* save the first error encountered for the
* return value */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e85bdfd15fed..b86cc04959de 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -180,7 +180,7 @@ static int audit_match_filetype(struct audit_context *ctx, int val)
return 0;
list_for_each_entry(n, &ctx->names_list, list) {
- if ((n->ino != -1) &&
+ if ((n->ino != AUDIT_INO_UNSET) &&
((n->mode & S_IFMT) == mode))
return 1;
}
@@ -466,6 +466,9 @@ static int audit_filter_rules(struct task_struct *tsk,
result = audit_comparator(ctx->ppid, f->op, f->val);
}
break;
+ case AUDIT_EXE:
+ result = audit_exe_compare(tsk, rule->exe);
+ break;
case AUDIT_UID:
result = audit_uid_comparator(cred->uid, f->op, f->uid);
break;
@@ -1680,7 +1683,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
aname->should_free = true;
}
- aname->ino = (unsigned long)-1;
+ aname->ino = AUDIT_INO_UNSET;
aname->type = type;
list_add_tail(&aname->list, &context->names_list);
@@ -1922,7 +1925,7 @@ void __audit_inode_child(const struct inode *parent,
if (inode)
audit_copy_inode(found_child, dentry, inode);
else
- found_child->ino = (unsigned long)-1;
+ found_child->ino = AUDIT_INO_UNSET;
}
EXPORT_SYMBOL_GPL(__audit_inode_child);
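
The auditfilter.c and auditsc.c hunks above add the AUDIT_EXE rule field: userspace packs an executable path into the rule's string buffer, audit_data_to_entry() turns it into an audit fsnotify mark, and audit_filter_rules() matches that mark against the task's exe inode. Below is a hedged user-space sketch of what such a rule payload looks like; it is not part of this patch, it assumes a uapi <linux/audit.h> that already defines AUDIT_EXE, and build_exe_rule() is a made-up helper name.

/*
 * Hedged sketch, not part of this patch: build the audit_rule_data payload
 * that audit_data_to_entry() unpacks into an AUDIT_EXE rule.  Assumes a uapi
 * <linux/audit.h> that defines AUDIT_EXE; the netlink AUDIT_ADD_RULE send is
 * left out.
 */
#include <linux/audit.h>
#include <stdlib.h>
#include <string.h>

struct audit_rule_data *build_exe_rule(const char *exe_path)
{
	size_t plen = strlen(exe_path);
	struct audit_rule_data *d;

	d = calloc(1, sizeof(*d) + plen);
	if (!d)
		return NULL;

	d->flags  = AUDIT_FILTER_EXIT;	/* AUDIT_EXE is only valid on the exit list */
	d->action = AUDIT_ALWAYS;
	memset(d->mask, 0xff, sizeof(d->mask));	/* match every syscall ("-S all") */

	d->field_count   = 1;
	d->fields[0]     = AUDIT_EXE;	/* the new field type */
	d->fieldflags[0] = AUDIT_EQUAL;	/* only Audit_equal is accepted */
	d->values[0]     = plen;	/* string length; the path lives in buf[] */
	memcpy(d->buf, exe_path, plen);
	d->buflen = plen;

	return d;	/* caller sends this as an AUDIT_ADD_RULE netlink message */
}

Conceptually this is the kernel side of something like "auditctl -a always,exit -S all -F exe=/usr/bin/foo", once the userspace tools grow matching support.
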
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index cb31229a6fa4..29ace107f236 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -150,15 +150,15 @@ static int __init register_array_map(void)
}
late_initcall(register_array_map);
-static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
+static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
{
- /* only bpf_prog file descriptors can be stored in prog_array map */
+ /* only file descriptors can be stored in this type of map */
if (attr->value_size != sizeof(u32))
return ERR_PTR(-EINVAL);
return array_map_alloc(attr);
}
-static void prog_array_map_free(struct bpf_map *map)
+static void fd_array_map_free(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
@@ -167,21 +167,21 @@ static void prog_array_map_free(struct bpf_map *map)
/* make sure it's empty */
for (i = 0; i < array->map.max_entries; i++)
- BUG_ON(array->prog[i] != NULL);
+ BUG_ON(array->ptrs[i] != NULL);
kvfree(array);
}
-static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key)
+static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
{
return NULL;
}
/* only called from syscall */
-static int prog_array_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static int fd_array_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
- struct bpf_prog *prog, *old_prog;
+ void *new_ptr, *old_ptr;
u32 index = *(u32 *)key, ufd;
if (map_flags != BPF_ANY)
@@ -191,57 +191,75 @@ static int prog_array_map_update_elem(struct bpf_map *map, void *key,
return -E2BIG;
ufd = *(u32 *)value;
- prog = bpf_prog_get(ufd);
- if (IS_ERR(prog))
- return PTR_ERR(prog);
-
- if (!bpf_prog_array_compatible(array, prog)) {
- bpf_prog_put(prog);
- return -EINVAL;
- }
+ new_ptr = map->ops->map_fd_get_ptr(map, ufd);
+ if (IS_ERR(new_ptr))
+ return PTR_ERR(new_ptr);
- old_prog = xchg(array->prog + index, prog);
- if (old_prog)
- bpf_prog_put_rcu(old_prog);
+ old_ptr = xchg(array->ptrs + index, new_ptr);
+ if (old_ptr)
+ map->ops->map_fd_put_ptr(old_ptr);
return 0;
}
-static int prog_array_map_delete_elem(struct bpf_map *map, void *key)
+static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
- struct bpf_prog *old_prog;
+ void *old_ptr;
u32 index = *(u32 *)key;
if (index >= array->map.max_entries)
return -E2BIG;
- old_prog = xchg(array->prog + index, NULL);
- if (old_prog) {
- bpf_prog_put_rcu(old_prog);
+ old_ptr = xchg(array->ptrs + index, NULL);
+ if (old_ptr) {
+ map->ops->map_fd_put_ptr(old_ptr);
return 0;
} else {
return -ENOENT;
}
}
+static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct bpf_prog *prog = bpf_prog_get(fd);
+ if (IS_ERR(prog))
+ return prog;
+
+ if (!bpf_prog_array_compatible(array, prog)) {
+ bpf_prog_put(prog);
+ return ERR_PTR(-EINVAL);
+ }
+ return prog;
+}
+
+static void prog_fd_array_put_ptr(void *ptr)
+{
+ struct bpf_prog *prog = ptr;
+
+ bpf_prog_put_rcu(prog);
+}
+
/* decrement refcnt of all bpf_progs that are stored in this map */
-void bpf_prog_array_map_clear(struct bpf_map *map)
+void bpf_fd_array_map_clear(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
for (i = 0; i < array->map.max_entries; i++)
- prog_array_map_delete_elem(map, &i);
+ fd_array_map_delete_elem(map, &i);
}
static const struct bpf_map_ops prog_array_ops = {
- .map_alloc = prog_array_map_alloc,
- .map_free = prog_array_map_free,
+ .map_alloc = fd_array_map_alloc,
+ .map_free = fd_array_map_free,
.map_get_next_key = array_map_get_next_key,
- .map_lookup_elem = prog_array_map_lookup_elem,
- .map_update_elem = prog_array_map_update_elem,
- .map_delete_elem = prog_array_map_delete_elem,
+ .map_lookup_elem = fd_array_map_lookup_elem,
+ .map_update_elem = fd_array_map_update_elem,
+ .map_delete_elem = fd_array_map_delete_elem,
+ .map_fd_get_ptr = prog_fd_array_get_ptr,
+ .map_fd_put_ptr = prog_fd_array_put_ptr,
};
static struct bpf_map_type_list prog_array_type __read_mostly = {
@@ -255,3 +273,60 @@ static int __init register_prog_array_map(void)
return 0;
}
late_initcall(register_prog_array_map);
+
+static void perf_event_array_map_free(struct bpf_map *map)
+{
+ bpf_fd_array_map_clear(map);
+ fd_array_map_free(map);
+}
+
+static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
+{
+ struct perf_event *event;
+ const struct perf_event_attr *attr;
+
+ event = perf_event_get(fd);
+ if (IS_ERR(event))
+ return event;
+
+ attr = perf_event_attrs(event);
+ if (IS_ERR(attr))
+ return (void *)attr;
+
+ if (attr->type != PERF_TYPE_RAW &&
+ attr->type != PERF_TYPE_HARDWARE) {
+ perf_event_release_kernel(event);
+ return ERR_PTR(-EINVAL);
+ }
+ return event;
+}
+
+static void perf_event_fd_array_put_ptr(void *ptr)
+{
+ struct perf_event *event = ptr;
+
+ perf_event_release_kernel(event);
+}
+
+static const struct bpf_map_ops perf_event_array_ops = {
+ .map_alloc = fd_array_map_alloc,
+ .map_free = perf_event_array_map_free,
+ .map_get_next_key = array_map_get_next_key,
+ .map_lookup_elem = fd_array_map_lookup_elem,
+ .map_update_elem = fd_array_map_update_elem,
+ .map_delete_elem = fd_array_map_delete_elem,
+ .map_fd_get_ptr = perf_event_fd_array_get_ptr,
+ .map_fd_put_ptr = perf_event_fd_array_put_ptr,
+};
+
+static struct bpf_map_type_list perf_event_array_type __read_mostly = {
+ .ops = &perf_event_array_ops,
+ .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+};
+
+static int __init register_perf_event_array_map(void)
+{
+ bpf_register_map_type(&perf_event_array_type);
+ return 0;
+}
+late_initcall(register_perf_event_array_map);
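
The arraymap.c rework above splits the old prog-array code into a generic fd-array map plus per-type map_fd_get_ptr/map_fd_put_ptr callbacks, and adds BPF_MAP_TYPE_PERF_EVENT_ARRAY on top of it. The following is a hedged user-space sketch of dropping a perf event fd into such a map, which lands in fd_array_map_update_elem()/perf_event_fd_array_get_ptr() above; the raw syscall wrappers and the one-slot-per-CPU layout are assumptions, not part of this patch.

/*
 * Hedged sketch, not part of this patch: store a perf event fd in a
 * BPF_MAP_TYPE_PERF_EVENT_ARRAY slot via the bpf(2) syscall.
 */
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/bpf.h>
#include <linux/perf_event.h>

static long sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

static int sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
			       int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

/* Put a hardware cycle counter for @cpu into slot @cpu of the map. */
int install_cycle_counter(int map_fd, int cpu)
{
	struct perf_event_attr pattr = {
		.type   = PERF_TYPE_HARDWARE,	/* only RAW/HARDWARE pass get_ptr() */
		.config = PERF_COUNT_HW_CPU_CYCLES,
		.size   = sizeof(pattr),
	};
	union bpf_attr battr;
	int event_fd;

	event_fd = sys_perf_event_open(&pattr, -1 /* any task */, cpu, -1, 0);
	if (event_fd < 0)
		return -1;

	memset(&battr, 0, sizeof(battr));
	battr.map_fd = map_fd;
	battr.key    = (uint64_t)(unsigned long)&cpu;
	battr.value  = (uint64_t)(unsigned long)&event_fd;
	battr.flags  = BPF_ANY;		/* the only flag fd-array maps accept */

	return sys_bpf(BPF_MAP_UPDATE_ELEM, &battr, sizeof(battr));
}

Only PERF_TYPE_RAW and PERF_TYPE_HARDWARE events are accepted by perf_event_fd_array_get_ptr(), so any other event type fails the update with -EINVAL.
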
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c5bedc82bc1c..67c380cfa9ca 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -177,6 +177,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
return 0;
}
+EXPORT_SYMBOL_GPL(__bpf_call_base);
/**
* __bpf_prog_run - run eBPF program on a given context
@@ -449,11 +450,15 @@ select_insn:
tail_call_cnt++;
- prog = READ_ONCE(array->prog[index]);
+ prog = READ_ONCE(array->ptrs[index]);
if (unlikely(!prog))
goto out;
- ARG1 = BPF_R1;
+ /* ARG1 at this point is guaranteed to point to CTX from
+ * the verifier side due to the fact that the tail call is
+ * handeled like a helper, that is, bpf_tail_call_proto,
+ * where arg1_type is ARG_PTR_TO_CTX.
+ */
insn = prog->insnsi;
goto select_insn;
out:
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a1b14d197a4f..35bac8e8b071 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -72,7 +72,7 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
/* prog_array stores refcnt-ed bpf_prog pointers
* release them all when user space closes prog_array_fd
*/
- bpf_prog_array_map_clear(map);
+ bpf_fd_array_map_clear(map);
bpf_map_put(map);
return 0;
@@ -155,14 +155,15 @@ static int map_lookup_elem(union bpf_attr *attr)
void __user *ukey = u64_to_ptr(attr->key);
void __user *uvalue = u64_to_ptr(attr->value);
int ufd = attr->map_fd;
- struct fd f = fdget(ufd);
struct bpf_map *map;
void *key, *value, *ptr;
+ struct fd f;
int err;
if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
return -EINVAL;
+ f = fdget(ufd);
map = bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
@@ -213,14 +214,15 @@ static int map_update_elem(union bpf_attr *attr)
void __user *ukey = u64_to_ptr(attr->key);
void __user *uvalue = u64_to_ptr(attr->value);
int ufd = attr->map_fd;
- struct fd f = fdget(ufd);
struct bpf_map *map;
void *key, *value;
+ struct fd f;
int err;
if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
return -EINVAL;
+ f = fdget(ufd);
map = bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
@@ -265,14 +267,15 @@ static int map_delete_elem(union bpf_attr *attr)
{
void __user *ukey = u64_to_ptr(attr->key);
int ufd = attr->map_fd;
- struct fd f = fdget(ufd);
struct bpf_map *map;
+ struct fd f;
void *key;
int err;
if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
return -EINVAL;
+ f = fdget(ufd);
map = bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
@@ -305,14 +308,15 @@ static int map_get_next_key(union bpf_attr *attr)
void __user *ukey = u64_to_ptr(attr->key);
void __user *unext_key = u64_to_ptr(attr->next_key);
int ufd = attr->map_fd;
- struct fd f = fdget(ufd);
struct bpf_map *map;
void *key, *next_key;
+ struct fd f;
int err;
if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
return -EINVAL;
+ f = fdget(ufd);
map = bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 039d866fd36a..b074b23000d6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -238,6 +238,14 @@ static const char * const reg_type_str[] = {
[CONST_IMM] = "imm",
};
+static const struct {
+ int map_type;
+ int func_id;
+} func_limit[] = {
+ {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
+ {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
+};
+
static void print_verifier_state(struct verifier_env *env)
{
enum bpf_reg_type t;
@@ -275,7 +283,7 @@ static const char *const bpf_class_string[] = {
[BPF_ALU64] = "alu64",
};
-static const char *const bpf_alu_string[] = {
+static const char *const bpf_alu_string[16] = {
[BPF_ADD >> 4] = "+=",
[BPF_SUB >> 4] = "-=",
[BPF_MUL >> 4] = "*=",
@@ -299,7 +307,7 @@ static const char *const bpf_ldst_string[] = {
[BPF_DW >> 3] = "u64",
};
-static const char *const bpf_jmp_string[] = {
+static const char *const bpf_jmp_string[16] = {
[BPF_JA >> 4] = "jmp",
[BPF_JEQ >> 4] = "==",
[BPF_JGT >> 4] = ">",
@@ -648,6 +656,9 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
struct verifier_state *state = &env->cur_state;
int size, err = 0;
+ if (state->regs[regno].type == PTR_TO_STACK)
+ off += state->regs[regno].imm;
+
size = bpf_size_to_bytes(bpf_size);
if (size < 0)
return size;
@@ -667,7 +678,8 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown_value(state->regs, value_regno);
- } else if (state->regs[regno].type == FRAME_PTR) {
+ } else if (state->regs[regno].type == FRAME_PTR ||
+ state->regs[regno].type == PTR_TO_STACK) {
if (off >= 0 || off < -MAX_BPF_STACK) {
verbose("invalid stack off=%d size=%d\n", off, size);
return -EACCES;
@@ -833,6 +845,28 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
return err;
}
+static int check_map_func_compatibility(struct bpf_map *map, int func_id)
+{
+ bool bool_map, bool_func;
+ int i;
+
+ if (!map)
+ return 0;
+
+ for (i = 0; i < ARRAY_SIZE(func_limit); i++) {
+ bool_map = (map->map_type == func_limit[i].map_type);
+ bool_func = (func_id == func_limit[i].func_id);
+		/* Only when the map & func pair matches can we continue;
+		 * don't allow any other map type to be passed into
+		 * the special func.
+		 */
+ if (bool_map != bool_func)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int check_call(struct verifier_env *env, int func_id)
{
struct verifier_state *state = &env->cur_state;
@@ -908,21 +942,9 @@ static int check_call(struct verifier_env *env, int func_id)
return -EINVAL;
}
- if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
- func_id != BPF_FUNC_tail_call)
- /* prog_array map type needs extra care:
- * only allow to pass it into bpf_tail_call() for now.
- * bpf_map_delete_elem() can be allowed in the future,
- * while bpf_map_update_elem() must only be done via syscall
- */
- return -EINVAL;
-
- if (func_id == BPF_FUNC_tail_call &&
- map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
- /* don't allow any other map type to be passed into
- * bpf_tail_call()
- */
- return -EINVAL;
+ err = check_map_func_compatibility(map, func_id);
+ if (err)
+ return err;
return 0;
}
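
check_map_func_compatibility() above generalises the old prog-array restriction into a table: bpf_tail_call() may only be used with PROG_ARRAY maps and bpf_perf_event_read() only with PERF_EVENT_ARRAY maps. Below is a hedged eBPF-C sketch of a program that stays within those pairings, written roughly in the style of the samples/bpf code of the time; the SEC() macro, bpf_map_def layout and helper prototypes are assumed from those sample headers, not from this diff.

/*
 * Hedged eBPF-C sketch, not part of this patch: one map per permitted
 * helper pairing.  BPF_FUNC_perf_event_read comes from this series.
 */
#include <uapi/linux/bpf.h>
#include <uapi/linux/ptrace.h>

#define SEC(NAME) __attribute__((section(NAME), used))

struct bpf_map_def {
	unsigned int type;
	unsigned int key_size;
	unsigned int value_size;
	unsigned int max_entries;
};

static void (*bpf_tail_call)(void *ctx, void *map, int index) =
	(void *) BPF_FUNC_tail_call;
static unsigned long long (*bpf_perf_event_read)(void *map, int index) =
	(void *) BPF_FUNC_perf_event_read;

struct bpf_map_def SEC("maps") jmp_table = {
	.type        = BPF_MAP_TYPE_PROG_ARRAY,		/* only bpf_tail_call() */
	.key_size    = sizeof(int),
	.value_size  = sizeof(int),
	.max_entries = 8,
};

struct bpf_map_def SEC("maps") counters = {
	.type        = BPF_MAP_TYPE_PERF_EVENT_ARRAY,	/* only bpf_perf_event_read() */
	.key_size    = sizeof(int),
	.value_size  = sizeof(int),
	.max_entries = 64,
};

SEC("kprobe/sys_write")
int probe(struct pt_regs *ctx)
{
	unsigned long long cycles;

	cycles = bpf_perf_event_read(&counters, 0);	/* counter stored in slot 0 */
	if (cycles & 1)
		bpf_tail_call(ctx, &jmp_table, 0);	/* program stored in slot 0 */
	return 0;
}

Swapping the two maps in the helper calls would now be rejected at load time by the verifier rather than misbehaving at run time.
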
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b89f3168411b..2cf0f79f1fc9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -145,6 +145,7 @@ static const char *cgroup_subsys_name[] = {
* part of that cgroup.
*/
struct cgroup_root cgrp_dfl_root;
+EXPORT_SYMBOL_GPL(cgrp_dfl_root);
/*
* The default hierarchy always exists but is hidden until mounted for the
@@ -186,6 +187,9 @@ static u64 css_serial_nr_next = 1;
static unsigned long have_fork_callback __read_mostly;
static unsigned long have_exit_callback __read_mostly;
+/* Ditto for the can_fork callback. */
+static unsigned long have_canfork_callback __read_mostly;
+
static struct cftype cgroup_dfl_base_files[];
static struct cftype cgroup_legacy_base_files[];
@@ -207,7 +211,7 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
idr_preload(gfp_mask);
spin_lock_bh(&cgroup_idr_lock);
- ret = idr_alloc(idr, ptr, start, end, gfp_mask);
+ ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT);
spin_unlock_bh(&cgroup_idr_lock);
idr_preload_end();
return ret;
@@ -1027,10 +1031,13 @@ static const struct file_operations proc_cgroupstats_operations;
static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
char *buf)
{
+ struct cgroup_subsys *ss = cft->ss;
+
if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
!(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
- cft->ss->name, cft->name);
+ cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
+ cft->name);
else
strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
return buf;
@@ -1332,9 +1339,10 @@ static int cgroup_show_options(struct seq_file *seq,
struct cgroup_subsys *ss;
int ssid;
- for_each_subsys(ss, ssid)
- if (root->subsys_mask & (1 << ssid))
- seq_printf(seq, ",%s", ss->name);
+ if (root != &cgrp_dfl_root)
+ for_each_subsys(ss, ssid)
+ if (root->subsys_mask & (1 << ssid))
+ seq_show_option(seq, ss->legacy_name, NULL);
if (root->flags & CGRP_ROOT_NOPREFIX)
seq_puts(seq, ",noprefix");
if (root->flags & CGRP_ROOT_XATTR)
@@ -1342,13 +1350,14 @@ static int cgroup_show_options(struct seq_file *seq,
spin_lock(&release_agent_path_lock);
if (strlen(root->release_agent_path))
- seq_printf(seq, ",release_agent=%s", root->release_agent_path);
+ seq_show_option(seq, "release_agent",
+ root->release_agent_path);
spin_unlock(&release_agent_path_lock);
if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
seq_puts(seq, ",clone_children");
if (strlen(root->name))
- seq_printf(seq, ",name=%s", root->name);
+ seq_show_option(seq, "name", root->name);
return 0;
}
@@ -1447,7 +1456,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
}
for_each_subsys(ss, i) {
- if (strcmp(token, ss->name))
+ if (strcmp(token, ss->legacy_name))
continue;
if (ss->disabled)
continue;
@@ -1666,7 +1675,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
lockdep_assert_held(&cgroup_mutex);
- ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
+ ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
if (ret < 0)
goto out;
root_cgrp->id = ret;
@@ -4579,7 +4588,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
if (err)
goto err_free_css;
- err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
+ err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
if (err < 0)
goto err_free_percpu_ref;
css->id = err;
@@ -4656,7 +4665,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
* Temporarily set the pointer to NULL, so idr_find() won't return
* a half-baked cgroup.
*/
- cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
+ cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
if (cgrp->id < 0) {
ret = -ENOMEM;
goto out_cancel_ref;
@@ -4955,6 +4964,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
have_fork_callback |= (bool)ss->fork << ss->id;
have_exit_callback |= (bool)ss->exit << ss->id;
+ have_canfork_callback |= (bool)ss->can_fork << ss->id;
/* At system boot, before all subsystems have been
* registered, no tasks have been forked, so we don't
@@ -4993,6 +5003,8 @@ int __init cgroup_init_early(void)
ss->id = i;
ss->name = cgroup_subsys_name[i];
+ if (!ss->legacy_name)
+ ss->legacy_name = cgroup_subsys_name[i];
if (ss->early_init)
cgroup_init_subsys(ss, true);
@@ -5136,9 +5148,11 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
continue;
seq_printf(m, "%d:", root->hierarchy_id);
- for_each_subsys(ss, ssid)
- if (root->subsys_mask & (1 << ssid))
- seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
+ if (root != &cgrp_dfl_root)
+ for_each_subsys(ss, ssid)
+ if (root->subsys_mask & (1 << ssid))
+ seq_printf(m, "%s%s", count++ ? "," : "",
+ ss->legacy_name);
if (strlen(root->name))
seq_printf(m, "%sname=%s", count ? "," : "",
root->name);
@@ -5178,7 +5192,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
for_each_subsys(ss, i)
seq_printf(m, "%s\t%d\t%d\t%d\n",
- ss->name, ss->root->hierarchy_id,
+ ss->legacy_name, ss->root->hierarchy_id,
atomic_read(&ss->root->nr_cgrps), !ss->disabled);
mutex_unlock(&cgroup_mutex);
@@ -5197,6 +5211,19 @@ static const struct file_operations proc_cgroupstats_operations = {
.release = single_release,
};
+static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
+{
+ if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END)
+ return &ss_priv[i - CGROUP_CANFORK_START];
+ return NULL;
+}
+
+static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
+{
+ void **private = subsys_canfork_priv_p(ss_priv, i);
+ return private ? *private : NULL;
+}
+
/**
* cgroup_fork - initialize cgroup related fields during copy_process()
* @child: pointer to task_struct of forking parent process.
@@ -5212,6 +5239,57 @@ void cgroup_fork(struct task_struct *child)
}
/**
+ * cgroup_can_fork - called on a new task before the process is exposed
+ * @child: the task in question.
+ *
+ * This calls the subsystem can_fork() callbacks. If the can_fork() callback
+ * returns an error, the fork aborts with that error code. This allows for
+ * a cgroup subsystem to conditionally allow or deny new forks.
+ */
+int cgroup_can_fork(struct task_struct *child,
+ void *ss_priv[CGROUP_CANFORK_COUNT])
+{
+ struct cgroup_subsys *ss;
+ int i, j, ret;
+
+ for_each_subsys_which(ss, i, &have_canfork_callback) {
+ ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i));
+ if (ret)
+ goto out_revert;
+ }
+
+ return 0;
+
+out_revert:
+ for_each_subsys(ss, j) {
+ if (j >= i)
+ break;
+ if (ss->cancel_fork)
+ ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j));
+ }
+
+ return ret;
+}
+
+/**
+ * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
+ * @child: the task in question
+ *
+ * This calls the cancel_fork() callbacks if a fork failed *after*
+ * cgroup_can_fork() succeeded.
+ */
+void cgroup_cancel_fork(struct task_struct *child,
+ void *ss_priv[CGROUP_CANFORK_COUNT])
+{
+ struct cgroup_subsys *ss;
+ int i;
+
+ for_each_subsys(ss, i)
+ if (ss->cancel_fork)
+ ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i));
+}
+
+/**
* cgroup_post_fork - called on a new task after adding it to the task list
* @child: the task in question
*
@@ -5221,7 +5299,8 @@ void cgroup_fork(struct task_struct *child)
* cgroup_task_iter_start() - to guarantee that the new task ends up on its
* list.
*/
-void cgroup_post_fork(struct task_struct *child)
+void cgroup_post_fork(struct task_struct *child,
+ void *old_ss_priv[CGROUP_CANFORK_COUNT])
{
struct cgroup_subsys *ss;
int i;
@@ -5266,7 +5345,7 @@ void cgroup_post_fork(struct task_struct *child)
* and addition to css_set.
*/
for_each_subsys_which(ss, i, &have_fork_callback)
- ss->fork(child);
+ ss->fork(child, subsys_canfork_priv(old_ss_priv, i));
}
/**
@@ -5400,12 +5479,14 @@ static int __init cgroup_disable(char *str)
continue;
for_each_subsys(ss, i) {
- if (!strcmp(token, ss->name)) {
- ss->disabled = 1;
- printk(KERN_INFO "Disabling %s control group"
- " subsystem\n", ss->name);
- break;
- }
+ if (strcmp(token, ss->name) &&
+ strcmp(token, ss->legacy_name))
+ continue;
+
+ ss->disabled = 1;
+ printk(KERN_INFO "Disabling %s control group subsystem\n",
+ ss->name);
+ break;
}
}
return 1;
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 92b98cc0ee76..f1b30ad5dc6d 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -203,7 +203,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
* to do anything as freezer_attach() will put @task into the appropriate
* state.
*/
-static void freezer_fork(struct task_struct *task)
+static void freezer_fork(struct task_struct *task, void *private)
{
struct freezer *freezer;
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
new file mode 100644
index 000000000000..806cd7693ac8
--- /dev/null
+++ b/kernel/cgroup_pids.c
@@ -0,0 +1,355 @@
+/*
+ * Process number limiting controller for cgroups.
+ *
+ * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
+ * after a certain limit is reached.
+ *
+ * Since it is trivial to hit the task limit without hitting any kmemcg limits
+ * in place, PIDs are a fundamental resource. As such, PID exhaustion must be
+ * preventable in the scope of a cgroup hierarchy by allowing resource limiting
+ * of the number of tasks in a cgroup.
+ *
+ * In order to use the `pids` controller, set the maximum number of tasks in
+ * pids.max (this is not available in the root cgroup for obvious reasons). The
+ * number of processes currently in the cgroup is given by pids.current.
+ * Organisational operations are not blocked by cgroup policies, so it is
+ * possible to have pids.current > pids.max. However, it is not possible to
+ * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
+ * would cause a cgroup policy to be violated.
+ *
+ * To set a cgroup to have no limit, set pids.max to "max". This is the default
+ * for all new cgroups (N.B. that PID limits are hierarchical, so the most
+ * stringent limit in the hierarchy is followed).
+ *
+ * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
+ * a superset of parent/child/pids.current.
+ *
+ * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/threads.h>
+#include <linux/atomic.h>
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+
+#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
+#define PIDS_MAX_STR "max"
+
+struct pids_cgroup {
+ struct cgroup_subsys_state css;
+
+ /*
+ * Use 64-bit types so that we can safely represent "max" as
+ * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
+ */
+ atomic64_t counter;
+ int64_t limit;
+};
+
+static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
+{
+ return container_of(css, struct pids_cgroup, css);
+}
+
+static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
+{
+ return css_pids(pids->css.parent);
+}
+
+static struct cgroup_subsys_state *
+pids_css_alloc(struct cgroup_subsys_state *parent)
+{
+ struct pids_cgroup *pids;
+
+ pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
+ if (!pids)
+ return ERR_PTR(-ENOMEM);
+
+ pids->limit = PIDS_MAX;
+ atomic64_set(&pids->counter, 0);
+ return &pids->css;
+}
+
+static void pids_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css_pids(css));
+}
+
+/**
+ * pids_cancel - uncharge the local pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to cancel
+ *
+ * This function will WARN if the pid count goes under 0, because such a case is
+ * a bug in the pids controller proper.
+ */
+static void pids_cancel(struct pids_cgroup *pids, int num)
+{
+ /*
+ * A negative count (or overflow for that matter) is invalid,
+ * and indicates a bug in the `pids` controller proper.
+ */
+ WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
+}
+
+/**
+ * pids_uncharge - hierarchically uncharge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to uncharge
+ */
+static void pids_uncharge(struct pids_cgroup *pids, int num)
+{
+ struct pids_cgroup *p;
+
+ for (p = pids; p; p = parent_pids(p))
+ pids_cancel(p, num);
+}
+
+/**
+ * pids_charge - hierarchically charge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to charge
+ *
+ * This function does *not* follow the pid limit set. It cannot fail and the new
+ * pid count may exceed the limit. This is only used for reverting failed
+ * attaches, where there is no other way out than violating the limit.
+ */
+static void pids_charge(struct pids_cgroup *pids, int num)
+{
+ struct pids_cgroup *p;
+
+ for (p = pids; p; p = parent_pids(p))
+ atomic64_add(num, &p->counter);
+}
+
+/**
+ * pids_try_charge - hierarchically try to charge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to charge
+ *
+ * This function follows the set limit. It will fail if the charge would cause
+ * the new value to exceed the hierarchical limit. Returns 0 if the charge
+ * succeeded, otherwise -EAGAIN.
+ */
+static int pids_try_charge(struct pids_cgroup *pids, int num)
+{
+ struct pids_cgroup *p, *q;
+
+ for (p = pids; p; p = parent_pids(p)) {
+ int64_t new = atomic64_add_return(num, &p->counter);
+
+ /*
+ * Since new is capped to the maximum number of pid_t, if
+ * p->limit is %PIDS_MAX then we know that this test will never
+ * fail.
+ */
+ if (new > p->limit)
+ goto revert;
+ }
+
+ return 0;
+
+revert:
+ for (q = pids; q != p; q = parent_pids(q))
+ pids_cancel(q, num);
+ pids_cancel(p, num);
+
+ return -EAGAIN;
+}
+
+static int pids_can_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct pids_cgroup *pids = css_pids(css);
+ struct task_struct *task;
+
+ cgroup_taskset_for_each(task, tset) {
+ struct cgroup_subsys_state *old_css;
+ struct pids_cgroup *old_pids;
+
+ /*
+ * No need to pin @old_css between here and cancel_attach()
+ * because cgroup core protects it from being freed before
+ * the migration completes or fails.
+ */
+ old_css = task_css(task, pids_cgrp_id);
+ old_pids = css_pids(old_css);
+
+ pids_charge(pids, 1);
+ pids_uncharge(old_pids, 1);
+ }
+
+ return 0;
+}
+
+static void pids_cancel_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct pids_cgroup *pids = css_pids(css);
+ struct task_struct *task;
+
+ cgroup_taskset_for_each(task, tset) {
+ struct cgroup_subsys_state *old_css;
+ struct pids_cgroup *old_pids;
+
+ old_css = task_css(task, pids_cgrp_id);
+ old_pids = css_pids(old_css);
+
+ pids_charge(old_pids, 1);
+ pids_uncharge(pids, 1);
+ }
+}
+
+static int pids_can_fork(struct task_struct *task, void **priv_p)
+{
+ struct cgroup_subsys_state *css;
+ struct pids_cgroup *pids;
+ int err;
+
+ /*
+ * Use the "current" task_css for the pids subsystem as the tentative
+ * css. It is possible we will charge the wrong hierarchy, in which
+ * case we will forcefully revert/reapply the charge on the right
+ * hierarchy after it is committed to the task proper.
+ */
+ css = task_get_css(current, pids_cgrp_id);
+ pids = css_pids(css);
+
+ err = pids_try_charge(pids, 1);
+ if (err)
+ goto err_css_put;
+
+ *priv_p = css;
+ return 0;
+
+err_css_put:
+ css_put(css);
+ return err;
+}
+
+static void pids_cancel_fork(struct task_struct *task, void *priv)
+{
+ struct cgroup_subsys_state *css = priv;
+ struct pids_cgroup *pids = css_pids(css);
+
+ pids_uncharge(pids, 1);
+ css_put(css);
+}
+
+static void pids_fork(struct task_struct *task, void *priv)
+{
+ struct cgroup_subsys_state *css;
+ struct cgroup_subsys_state *old_css = priv;
+ struct pids_cgroup *pids;
+ struct pids_cgroup *old_pids = css_pids(old_css);
+
+ css = task_get_css(task, pids_cgrp_id);
+ pids = css_pids(css);
+
+ /*
+ * If the association has changed, we have to revert and reapply the
+ * charge/uncharge on the wrong hierarchy to the current one. Since
+ * the association can only change due to an organisation event, it's
+ * okay for us to ignore the limit in this case.
+ */
+ if (pids != old_pids) {
+ pids_uncharge(old_pids, 1);
+ pids_charge(pids, 1);
+ }
+
+ css_put(css);
+ css_put(old_css);
+}
+
+static void pids_exit(struct cgroup_subsys_state *css,
+ struct cgroup_subsys_state *old_css,
+ struct task_struct *task)
+{
+ struct pids_cgroup *pids = css_pids(old_css);
+
+ pids_uncharge(pids, 1);
+}
+
+static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct cgroup_subsys_state *css = of_css(of);
+ struct pids_cgroup *pids = css_pids(css);
+ int64_t limit;
+ int err;
+
+ buf = strstrip(buf);
+ if (!strcmp(buf, PIDS_MAX_STR)) {
+ limit = PIDS_MAX;
+ goto set_limit;
+ }
+
+ err = kstrtoll(buf, 0, &limit);
+ if (err)
+ return err;
+
+ if (limit < 0 || limit >= PIDS_MAX)
+ return -EINVAL;
+
+set_limit:
+ /*
+ * Limit updates don't need to be mutex'd, since it isn't
+ * critical that any racing fork()s follow the new limit.
+ */
+ pids->limit = limit;
+ return nbytes;
+}
+
+static int pids_max_show(struct seq_file *sf, void *v)
+{
+ struct cgroup_subsys_state *css = seq_css(sf);
+ struct pids_cgroup *pids = css_pids(css);
+ int64_t limit = pids->limit;
+
+ if (limit >= PIDS_MAX)
+ seq_printf(sf, "%s\n", PIDS_MAX_STR);
+ else
+ seq_printf(sf, "%lld\n", limit);
+
+ return 0;
+}
+
+static s64 pids_current_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct pids_cgroup *pids = css_pids(css);
+
+ return atomic64_read(&pids->counter);
+}
+
+static struct cftype pids_files[] = {
+ {
+ .name = "max",
+ .write = pids_max_write,
+ .seq_show = pids_max_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "current",
+ .read_s64 = pids_current_read,
+ },
+ { } /* terminate */
+};
+
+struct cgroup_subsys pids_cgrp_subsys = {
+ .css_alloc = pids_css_alloc,
+ .css_free = pids_css_free,
+ .can_attach = pids_can_attach,
+ .cancel_attach = pids_cancel_attach,
+ .can_fork = pids_can_fork,
+ .cancel_fork = pids_cancel_fork,
+ .fork = pids_fork,
+ .exit = pids_exit,
+ .legacy_cftypes = pids_files,
+ .dfl_cftypes = pids_files,
+};
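
The charge path in pids_try_charge() above walks from the charged cgroup up through its ancestors, and on the first level that would exceed its limit it unwinds only the levels it already charged. A minimal stand-alone sketch of that walk-and-revert shape, using C11 atomics and a hypothetical struct node in place of the kernel's pids_cgroup:

#include <errno.h>
#include <stdatomic.h>
#include <stdint.h>

struct node {
	struct node *parent;		/* NULL at the root */
	atomic_int_fast64_t counter;	/* current charge */
	int64_t limit;			/* INT64_MAX stands in for "max" */
};

/* Charge @num on @n and every ancestor; revert if any level goes over limit. */
static int try_charge(struct node *n, int64_t num)
{
	struct node *p, *q;

	for (p = n; p; p = p->parent) {
		int64_t new = atomic_fetch_add(&p->counter, num) + num;

		if (new > p->limit)
			goto revert;
	}
	return 0;

revert:
	/* Uncharge the levels charged so far, then the level that failed. */
	for (q = n; q != p; q = q->parent)
		atomic_fetch_sub(&q->counter, num);
	atomic_fetch_sub(&p->counter, num);
	return -EAGAIN;
}

In the controller itself the limit and usage surface per cgroup as the pids.max and pids.current files declared in pids_files[], with the string "max" (PIDS_MAX_STR) standing for no controller-imposed limit.
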
diff --git a/kernel/cred.c b/kernel/cred.c
index ec1c07667ec1..71179a09c1d6 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -20,11 +20,16 @@
#include <linux/cn_proc.h>
#if 0
-#define kdebug(FMT, ...) \
- printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
+#define kdebug(FMT, ...) \
+ printk("[%-5.5s%5u] " FMT "\n", \
+ current->comm, current->pid, ##__VA_ARGS__)
#else
-#define kdebug(FMT, ...) \
- no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
+#define kdebug(FMT, ...) \
+do { \
+ if (0) \
+ no_printk("[%-5.5s%5u] " FMT "\n", \
+ current->comm, current->pid, ##__VA_ARGS__); \
+} while (0)
#endif
static struct kmem_cache *cred_jar;
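
The rewritten kdebug() combines two standard idioms: wrapping in do { ... } while (0) so the macro is a single statement that nests safely under an unbraced if/else, and an if (0)-style guard so the format string and arguments are still type-checked while no code is emitted. A small user-space illustration of the same pattern, with an assumed dbg() name and plain printf() in place of no_printk():

#include <stdio.h>

#define DEBUG 0

/* Arguments stay format-checked even when DEBUG is 0, but nothing runs. */
#define dbg(fmt, ...)						\
do {								\
	if (DEBUG)						\
		printf("[dbg] " fmt "\n", ##__VA_ARGS__);	\
} while (0)

int main(void)
{
	int x = 42;

	if (x)
		dbg("x = %d", x);	/* expands to exactly one statement */
	else
		dbg("x is zero");

	return 0;
}

With -Wformat enabled, a mismatched dbg() argument is still diagnosed at compile time even though the branch is dead.
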
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ae16867670a9..f548f69c4299 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3222,6 +3222,59 @@ static inline u64 perf_event_count(struct perf_event *event)
return __perf_event_count(event);
}
+/*
+ * NMI-safe method to read a local event; that is, an event that:
+ *   - is either for the current task, or for this CPU
+ *   - does not have inherit set, because inherited task events
+ *     will not be local and we cannot read them atomically
+ *   - does not have a pmu::count method
+ */
+u64 perf_event_read_local(struct perf_event *event)
+{
+ unsigned long flags;
+ u64 val;
+
+ /*
+ * Disabling interrupts avoids all counter scheduling (context
+ * switches, timer based rotation and IPIs).
+ */
+ local_irq_save(flags);
+
+ /* If this is a per-task event, it must be for current */
+ WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
+ event->hw.target != current);
+
+ /* If this is a per-CPU event, it must be for this CPU */
+ WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
+ event->cpu != smp_processor_id());
+
+ /*
+ * It must not be an event with inherit set, we cannot read
+ * all child counters from atomic context.
+ */
+ WARN_ON_ONCE(event->attr.inherit);
+
+ /*
+ * It must not have a pmu::count method, those are not
+ * NMI safe.
+ */
+ WARN_ON_ONCE(event->pmu->count);
+
+ /*
+	 * If the event is currently on this CPU, it's either a per-task event,
+	 * or local to this CPU. Furthermore, it means it's ACTIVE (otherwise
+	 * oncpu == -1).
+ */
+ if (event->oncpu == smp_processor_id())
+ event->pmu->read(event);
+
+ val = local64_read(&event->count);
+ local_irq_restore(flags);
+
+ return val;
+}
+
static u64 perf_event_read(struct perf_event *event)
{
/*
@@ -8718,6 +8771,31 @@ void perf_event_delayed_put(struct task_struct *task)
WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
}
+struct perf_event *perf_event_get(unsigned int fd)
+{
+ int err;
+ struct fd f;
+ struct perf_event *event;
+
+ err = perf_fget_light(fd, &f);
+ if (err)
+ return ERR_PTR(err);
+
+ event = f.file->private_data;
+ atomic_long_inc(&event->refcount);
+ fdput(f);
+
+ return event;
+}
+
+const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
+{
+ if (!event)
+ return ERR_PTR(-EINVAL);
+
+ return &event->attr;
+}
+
/*
* inherit a event from parent task to child task:
*/
@@ -9016,7 +9094,7 @@ static void perf_event_init_cpu(int cpu)
mutex_unlock(&swhash->hlist_mutex);
}
-#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
+#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
static void __perf_event_exit_context(void *__info)
{
struct remove_event re = { .detach_group = true };
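
perf_event_read_local() above does little more than disable interrupts and read the counter; all of the safety comes from the preconditions it asserts (local to this CPU or to current, no inherit, no pmu->count). A hedged sketch of a consumer built on the perf_event_get()/perf_event_attrs() accessors added here; the helper name is hypothetical and this is not the kernel/trace/bpf_trace.c code that actually uses these hooks:

#include <linux/perf_event.h>
#include <linux/err.h>

/* Illustrative only: look up an event by perf fd and read it on this CPU. */
static u64 read_counter_from_fd(unsigned int fd)
{
	struct perf_event *event = perf_event_get(fd);
	const struct perf_event_attr *attr;

	if (IS_ERR(event))
		return 0;

	attr = perf_event_attrs(event);
	if (IS_ERR(attr) || attr->inherit)
		return 0;	/* inherited events cannot be read atomically */

	/* Valid only if the event is per-CPU on this CPU or per-task for current. */
	return perf_event_read_local(event);
}

Note that perf_event_get() takes a reference on the event (it bumps the refcount); whoever ends up owning the pointer is responsible for dropping it, which is elided here.
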
diff --git a/kernel/extable.c b/kernel/extable.c
index c98f926277a8..e820ccee9846 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -18,7 +18,6 @@
#include <linux/ftrace.h>
#include <linux/memory.h>
#include <linux/module.h>
-#include <linux/ftrace.h>
#include <linux/mutex.h>
#include <linux/init.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index 2b1a61cddc19..7d5f0f118a63 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -454,8 +454,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
tmp->vm_mm = mm;
if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
- tmp->vm_flags &= ~VM_LOCKED;
+ tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP);
tmp->vm_next = tmp->vm_prev = NULL;
+ tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
file = tmp->vm_file;
if (file) {
struct inode *inode = file_inode(file);
@@ -1246,6 +1247,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
{
int retval;
struct task_struct *p;
+ void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
@@ -1518,6 +1520,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->task_works = NULL;
/*
+ * Ensure that the cgroup subsystem policies allow the new process to be
+	 * forked. It should be noted that the new process's css_set can be changed
+ * between here and cgroup_post_fork() if an organisation operation is in
+ * progress.
+ */
+ retval = cgroup_can_fork(p, cgrp_ss_priv);
+ if (retval)
+ goto bad_fork_free_pid;
+
+ /*
* Make it visible to the rest of the system, but dont wake it up yet.
* Need tasklist lock for parent etc handling!
*/
@@ -1553,7 +1565,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
retval = -ERESTARTNOINTR;
- goto bad_fork_free_pid;
+ goto bad_fork_cancel_cgroup;
}
if (likely(p->pid)) {
@@ -1595,7 +1607,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
- cgroup_post_fork(p);
+ cgroup_post_fork(p, cgrp_ss_priv);
if (clone_flags & CLONE_THREAD)
threadgroup_change_end(current);
perf_event_fork(p);
@@ -1605,6 +1617,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
return p;
+bad_fork_cancel_cgroup:
+ cgroup_cancel_fork(p, cgrp_ss_priv);
bad_fork_free_pid:
if (pid != &init_struct_pid)
free_pid(pid);
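
copy_process() now uses a prepare/cancel/commit protocol for the cgroup controllers: cgroup_can_fork() may charge resources and fail, cgroup_cancel_fork() undoes that charge if any later step fails (the new bad_fork_cancel_cgroup label), and cgroup_post_fork() commits once the task is visible. A tiny stand-alone sketch of the same error-unwinding shape, with hypothetical hooks standing in for the cgroup calls:

#include <stdio.h>

static int can_fork(void)     { puts("charge");   return 0; }
static void cancel_fork(void) { puts("uncharge"); }
static void post_fork(void)   { puts("commit");   }

static int do_fork(int later_step_fails)
{
	int err = can_fork();

	if (err)
		return err;

	if (later_step_fails) {
		/* Any failure after the charge must unwind it. */
		cancel_fork();
		return -1;
	}

	post_fork();
	return 0;
}

int main(void)
{
	do_fork(0);	/* charge, commit */
	do_fork(1);	/* charge, uncharge */
	return 0;
}
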
diff --git a/kernel/futex.c b/kernel/futex.c
index c4a182f5357e..6e443efc65f4 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -64,6 +64,7 @@
#include <linux/hugetlb.h>
#include <linux/freezer.h>
#include <linux/bootmem.h>
+#include <linux/fault-inject.h>
#include <asm/futex.h>
@@ -258,6 +259,66 @@ static unsigned long __read_mostly futex_hashsize;
static struct futex_hash_bucket *futex_queues;
+/*
+ * Fault injections for futexes.
+ */
+#ifdef CONFIG_FAIL_FUTEX
+
+static struct {
+ struct fault_attr attr;
+
+ u32 ignore_private;
+} fail_futex = {
+ .attr = FAULT_ATTR_INITIALIZER,
+ .ignore_private = 0,
+};
+
+static int __init setup_fail_futex(char *str)
+{
+ return setup_fault_attr(&fail_futex.attr, str);
+}
+__setup("fail_futex=", setup_fail_futex);
+
+static bool should_fail_futex(bool fshared)
+{
+ if (fail_futex.ignore_private && !fshared)
+ return false;
+
+ return should_fail(&fail_futex.attr, 1);
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init fail_futex_debugfs(void)
+{
+ umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+ struct dentry *dir;
+
+ dir = fault_create_debugfs_attr("fail_futex", NULL,
+ &fail_futex.attr);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+
+ if (!debugfs_create_bool("ignore-private", mode, dir,
+ &fail_futex.ignore_private)) {
+ debugfs_remove_recursive(dir);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+late_initcall(fail_futex_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+#else
+static inline bool should_fail_futex(bool fshared)
+{
+ return false;
+}
+#endif /* CONFIG_FAIL_FUTEX */
+
static inline void futex_get_mm(union futex_key *key)
{
atomic_inc(&key->private.mm->mm_count);
@@ -413,6 +474,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
return -EFAULT;
+ if (unlikely(should_fail_futex(fshared)))
+ return -EFAULT;
+
/*
* PROCESS_PRIVATE futexes are fast.
* As the mm cannot disappear under us and the 'key' only needs
@@ -428,6 +492,10 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
}
again:
+ /* Ignore any VERIFY_READ mapping (futex common case) */
+ if (unlikely(should_fail_futex(fshared)))
+ return -EFAULT;
+
err = get_user_pages_fast(address, 1, 1, &page);
/*
* If write access is not required (eg. FUTEX_WAIT), try
@@ -516,7 +584,7 @@ again:
* A RO anonymous page will never change and thus doesn't make
* sense for futex operations.
*/
- if (ro) {
+ if (unlikely(should_fail_futex(fshared)) || ro) {
err = -EFAULT;
goto out;
}
@@ -974,6 +1042,9 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
u32 uninitialized_var(curval);
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
return -EFAULT;
@@ -1015,12 +1086,18 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
if (get_futex_value_locked(&uval, uaddr))
return -EFAULT;
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
/*
* Detect deadlocks.
*/
if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
return -EDEADLK;
+ if ((unlikely(should_fail_futex(true))))
+ return -EDEADLK;
+
/*
* Lookup existing state first. If it exists, try to attach to
* its pi_state.
@@ -1155,6 +1232,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
*/
newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
+ if (unlikely(should_fail_futex(true)))
+ ret = -EFAULT;
+
if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
ret = -EFAULT;
else if (curval != uval)
@@ -1457,6 +1537,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
if (get_futex_value_locked(&curval, pifutex))
return -EFAULT;
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
/*
* Find the top_waiter and determine if there are additional waiters.
* If the caller intends to requeue more than 1 waiter to pifutex,
@@ -2268,8 +2351,11 @@ static long futex_wait_restart(struct restart_block *restart)
/*
* Userspace tried a 0 -> TID atomic transition of the futex value
* and failed. The kernel side here does the whole locking operation:
- * if there are waiters then it will block, it does PI, etc. (Due to
- * races the kernel might see a 0 value of the futex too.)
+ * if there are waiters then it will block as a consequence of relying
+ * on rt-mutexes; it does PI, etc. (Due to races the kernel might see
+ * a 0 value of the futex too.)
+ *
+ * Also serves as the futex trylock_pi() operation, with the corresponding
+ * semantics.
*/
static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
ktime_t *time, int trylock)
@@ -2300,6 +2386,10 @@ retry_private:
ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
if (unlikely(ret)) {
+ /*
+ * Atomic work succeeded and we got the lock,
+ * or failed. Either way, we do _not_ block.
+ */
switch (ret) {
case 1:
/* We got the lock. */
@@ -2530,7 +2620,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
* @uaddr: the futex we initially wait on (non-pi)
* @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
- * the same type, no requeueing from private to shared, etc.
+ * the same type, no requeueing from private to shared, etc.
* @val: the expected value of uaddr
* @abs_time: absolute timeout
* @bitset: 32 bit wakeup bitset set by userspace, defaults to all
@@ -3005,6 +3095,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
cmd == FUTEX_WAIT_BITSET ||
cmd == FUTEX_WAIT_REQUEUE_PI)) {
+ if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
+ return -EFAULT;
if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
return -EFAULT;
if (!timespec_valid(&ts))
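
The CONFIG_FAIL_FUTEX additions follow the kernel's usual fault-injection shape: a should_fail_futex() predicate is consulted at each point where -EFAULT (or -EDEADLK) could legitimately be returned, and an ignore_private knob exempts process-private futexes. A stand-alone sketch of that shape, using a simple probability in place of the fault_attr machinery:

#include <errno.h>
#include <stdbool.h>
#include <stdlib.h>

static struct {
	int probability;	/* percentage of calls that should fail */
	bool ignore_private;	/* skip injection for process-private futexes */
} fail_futex = { .probability = 10, .ignore_private = true };

static bool should_fail_futex(bool fshared)
{
	if (fail_futex.ignore_private && !fshared)
		return false;
	return (rand() % 100) < fail_futex.probability;
}

/* Caller pattern: test the injection point before the real access. */
static int get_futex_key_stub(bool fshared)
{
	if (should_fail_futex(fshared))
		return -EFAULT;
	/* ... the real key lookup would go here ... */
	return 0;
}

In the real code the knobs come from the fault_attr boilerplate: the fail_futex= boot parameter registered with __setup(), plus a fail_futex/ directory (including the ignore-private file) under debugfs when CONFIG_FAULT_INJECTION_DEBUG_FS is set.
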
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 52ebaca1b9fc..f7dd15d537f9 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -54,7 +54,7 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
}
-static void jump_label_update(struct static_key *key, int enable);
+static void jump_label_update(struct static_key *key);
void static_key_slow_inc(struct static_key *key)
{
@@ -63,13 +63,8 @@ void static_key_slow_inc(struct static_key *key)
return;
jump_label_lock();
- if (atomic_read(&key->enabled) == 0) {
- if (!jump_label_get_branch_default(key))
- jump_label_update(key, JUMP_LABEL_ENABLE);
- else
- jump_label_update(key, JUMP_LABEL_DISABLE);
- }
- atomic_inc(&key->enabled);
+ if (atomic_inc_return(&key->enabled) == 1)
+ jump_label_update(key);
jump_label_unlock();
}
EXPORT_SYMBOL_GPL(static_key_slow_inc);
@@ -87,10 +82,7 @@ static void __static_key_slow_dec(struct static_key *key,
atomic_inc(&key->enabled);
schedule_delayed_work(work, rate_limit);
} else {
- if (!jump_label_get_branch_default(key))
- jump_label_update(key, JUMP_LABEL_DISABLE);
- else
- jump_label_update(key, JUMP_LABEL_ENABLE);
+ jump_label_update(key);
}
jump_label_unlock();
}
@@ -149,7 +141,7 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
return 0;
}
-/*
+/*
* Update code which is definitely not currently executing.
* Architectures which need heavyweight synchronization to modify
* running code can override this to make the non-live update case
@@ -158,37 +150,54 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
enum jump_label_type type)
{
- arch_jump_label_transform(entry, type);
+ arch_jump_label_transform(entry, type);
+}
+
+static inline struct jump_entry *static_key_entries(struct static_key *key)
+{
+ return (struct jump_entry *)((unsigned long)key->entries & ~JUMP_TYPE_MASK);
+}
+
+static inline bool static_key_type(struct static_key *key)
+{
+ return (unsigned long)key->entries & JUMP_TYPE_MASK;
+}
+
+static inline struct static_key *jump_entry_key(struct jump_entry *entry)
+{
+ return (struct static_key *)((unsigned long)entry->key & ~1UL);
+}
+
+static bool jump_entry_branch(struct jump_entry *entry)
+{
+ return (unsigned long)entry->key & 1UL;
+}
+
+static enum jump_label_type jump_label_type(struct jump_entry *entry)
+{
+ struct static_key *key = jump_entry_key(entry);
+ bool enabled = static_key_enabled(key);
+ bool branch = jump_entry_branch(entry);
+
+ /* See the comment in linux/jump_label.h */
+ return enabled ^ branch;
}
static void __jump_label_update(struct static_key *key,
struct jump_entry *entry,
- struct jump_entry *stop, int enable)
+ struct jump_entry *stop)
{
- for (; (entry < stop) &&
- (entry->key == (jump_label_t)(unsigned long)key);
- entry++) {
+ for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) {
/*
* entry->code set to 0 invalidates module init text sections
* kernel_text_address() verifies we are not in core kernel
* init code, see jump_label_invalidate_module_init().
*/
if (entry->code && kernel_text_address(entry->code))
- arch_jump_label_transform(entry, enable);
+ arch_jump_label_transform(entry, jump_label_type(entry));
}
}
-static enum jump_label_type jump_label_type(struct static_key *key)
-{
- bool true_branch = jump_label_get_branch_default(key);
- bool state = static_key_enabled(key);
-
- if ((!true_branch && state) || (true_branch && !state))
- return JUMP_LABEL_ENABLE;
-
- return JUMP_LABEL_DISABLE;
-}
-
void __init jump_label_init(void)
{
struct jump_entry *iter_start = __start___jump_table;
@@ -202,8 +211,11 @@ void __init jump_label_init(void)
for (iter = iter_start; iter < iter_stop; iter++) {
struct static_key *iterk;
- iterk = (struct static_key *)(unsigned long)iter->key;
- arch_jump_label_transform_static(iter, jump_label_type(iterk));
+ /* rewrite NOPs */
+ if (jump_label_type(iter) == JUMP_LABEL_NOP)
+ arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
+
+ iterk = jump_entry_key(iter);
if (iterk == key)
continue;
@@ -222,6 +234,16 @@ void __init jump_label_init(void)
#ifdef CONFIG_MODULES
+static enum jump_label_type jump_label_init_type(struct jump_entry *entry)
+{
+ struct static_key *key = jump_entry_key(entry);
+ bool type = static_key_type(key);
+ bool branch = jump_entry_branch(entry);
+
+ /* See the comment in linux/jump_label.h */
+ return type ^ branch;
+}
+
struct static_key_mod {
struct static_key_mod *next;
struct jump_entry *entries;
@@ -243,17 +265,15 @@ static int __jump_label_mod_text_reserved(void *start, void *end)
start, end);
}
-static void __jump_label_mod_update(struct static_key *key, int enable)
+static void __jump_label_mod_update(struct static_key *key)
{
- struct static_key_mod *mod = key->next;
+ struct static_key_mod *mod;
- while (mod) {
+ for (mod = key->next; mod; mod = mod->next) {
struct module *m = mod->mod;
__jump_label_update(key, mod->entries,
- m->jump_entries + m->num_jump_entries,
- enable);
- mod = mod->next;
+ m->jump_entries + m->num_jump_entries);
}
}
@@ -276,7 +296,9 @@ void jump_label_apply_nops(struct module *mod)
return;
for (iter = iter_start; iter < iter_stop; iter++) {
- arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE);
+ /* Only write NOPs for arch_branch_static(). */
+ if (jump_label_init_type(iter) == JUMP_LABEL_NOP)
+ arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
}
}
@@ -297,7 +319,7 @@ static int jump_label_add_module(struct module *mod)
for (iter = iter_start; iter < iter_stop; iter++) {
struct static_key *iterk;
- iterk = (struct static_key *)(unsigned long)iter->key;
+ iterk = jump_entry_key(iter);
if (iterk == key)
continue;
@@ -318,8 +340,9 @@ static int jump_label_add_module(struct module *mod)
jlm->next = key->next;
key->next = jlm;
- if (jump_label_type(key) == JUMP_LABEL_ENABLE)
- __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE);
+ /* Only update if we've changed from our initial state */
+ if (jump_label_type(iter) != jump_label_init_type(iter))
+ __jump_label_update(key, iter, iter_stop);
}
return 0;
@@ -334,10 +357,10 @@ static void jump_label_del_module(struct module *mod)
struct static_key_mod *jlm, **prev;
for (iter = iter_start; iter < iter_stop; iter++) {
- if (iter->key == (jump_label_t)(unsigned long)key)
+ if (jump_entry_key(iter) == key)
continue;
- key = (struct static_key *)(unsigned long)iter->key;
+ key = jump_entry_key(iter);
if (within_module(iter->key, mod))
continue;
@@ -439,14 +462,14 @@ int jump_label_text_reserved(void *start, void *end)
return ret;
}
-static void jump_label_update(struct static_key *key, int enable)
+static void jump_label_update(struct static_key *key)
{
struct jump_entry *stop = __stop___jump_table;
- struct jump_entry *entry = jump_label_get_entries(key);
+ struct jump_entry *entry = static_key_entries(key);
#ifdef CONFIG_MODULES
struct module *mod;
- __jump_label_mod_update(key, enable);
+ __jump_label_mod_update(key);
preempt_disable();
mod = __module_address((unsigned long)key);
@@ -456,7 +479,44 @@ static void jump_label_update(struct static_key *key, int enable)
#endif
/* if there are no users, entry can be NULL */
if (entry)
- __jump_label_update(key, entry, stop, enable);
+ __jump_label_update(key, entry, stop);
}
-#endif
+#ifdef CONFIG_STATIC_KEYS_SELFTEST
+static DEFINE_STATIC_KEY_TRUE(sk_true);
+static DEFINE_STATIC_KEY_FALSE(sk_false);
+
+static __init int jump_label_test(void)
+{
+ int i;
+
+ for (i = 0; i < 2; i++) {
+ WARN_ON(static_key_enabled(&sk_true.key) != true);
+ WARN_ON(static_key_enabled(&sk_false.key) != false);
+
+ WARN_ON(!static_branch_likely(&sk_true));
+ WARN_ON(!static_branch_unlikely(&sk_true));
+ WARN_ON(static_branch_likely(&sk_false));
+ WARN_ON(static_branch_unlikely(&sk_false));
+
+ static_branch_disable(&sk_true);
+ static_branch_enable(&sk_false);
+
+ WARN_ON(static_key_enabled(&sk_true.key) == true);
+ WARN_ON(static_key_enabled(&sk_false.key) == false);
+
+ WARN_ON(static_branch_likely(&sk_true));
+ WARN_ON(static_branch_unlikely(&sk_true));
+ WARN_ON(!static_branch_likely(&sk_false));
+ WARN_ON(!static_branch_unlikely(&sk_false));
+
+ static_branch_enable(&sk_true);
+ static_branch_disable(&sk_false);
+ }
+
+ return 0;
+}
+late_initcall(jump_label_test);
+#endif /* STATIC_KEYS_SELFTEST */
+
+#endif /* HAVE_JUMP_LABEL */
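
After this rework a site's instruction is derived from just two bits: whether the key is currently enabled and whether the site was compiled as a true (likely) or false (unlikely) branch, combined with XOR in jump_label_type(). A tiny stand-alone sketch of that decision table:

#include <stdbool.h>
#include <stdio.h>

enum jump_label_type { JUMP_LABEL_NOP = 0, JUMP_LABEL_JMP = 1 };

/* A site is patched to a jump exactly when the key state disagrees with
 * the polarity the branch was compiled with. */
static enum jump_label_type site_type(bool key_enabled, bool branch_is_true)
{
	return key_enabled ^ branch_is_true;
}

int main(void)
{
	printf("enabled=0 branch=0 -> %d\n", site_type(false, false));	/* NOP */
	printf("enabled=0 branch=1 -> %d\n", site_type(false, true));	/* JMP */
	printf("enabled=1 branch=0 -> %d\n", site_type(true,  false));	/* JMP */
	printf("enabled=1 branch=1 -> %d\n", site_type(true,  true));	/* NOP */
	return 0;
}

The CONFIG_STATIC_KEYS_SELFTEST initcall added above exercises exactly these combinations by toggling sk_true and sk_false and checking both the _likely and _unlikely accessors.
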
diff --git a/kernel/kexec.c b/kernel/kexec.c
index a785c1015e25..4c5edc357923 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1,156 +1,22 @@
/*
- * kexec.c - kexec system call
+ * kexec.c - kexec_load system call
* Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
*
* This source code is licensed under the GNU General Public License,
* Version 2. See the file COPYING for more details.
*/
-#define pr_fmt(fmt) "kexec: " fmt
-
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
#include <linux/kexec.h>
#include <linux/mutex.h>
#include <linux/list.h>
-#include <linux/highmem.h>
#include <linux/syscalls.h>
-#include <linux/reboot.h>
-#include <linux/ioport.h>
-#include <linux/hardirq.h>
-#include <linux/elf.h>
-#include <linux/elfcore.h>
-#include <linux/utsname.h>
-#include <linux/numa.h>
-#include <linux/suspend.h>
-#include <linux/device.h>
-#include <linux/freezer.h>
-#include <linux/pm.h>
-#include <linux/cpu.h>
-#include <linux/console.h>
#include <linux/vmalloc.h>
-#include <linux/swap.h>
-#include <linux/syscore_ops.h>
-#include <linux/compiler.h>
-#include <linux/hugetlb.h>
-
-#include <asm/page.h>
-#include <asm/uaccess.h>
-#include <asm/io.h>
-#include <asm/sections.h>
-
-#include <crypto/hash.h>
-#include <crypto/sha.h>
-
-/* Per cpu memory for storing cpu states in case of system crash. */
-note_buf_t __percpu *crash_notes;
-
-/* vmcoreinfo stuff */
-static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
-u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
-size_t vmcoreinfo_size;
-size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
-
-/* Flag to indicate we are going to kexec a new kernel */
-bool kexec_in_progress = false;
-
-/*
- * Declare these symbols weak so that if architecture provides a purgatory,
- * these will be overridden.
- */
-char __weak kexec_purgatory[0];
-size_t __weak kexec_purgatory_size = 0;
-
-#ifdef CONFIG_KEXEC_FILE
-static int kexec_calculate_store_digests(struct kimage *image);
-#endif
-
-/* Location of the reserved area for the crash kernel */
-struct resource crashk_res = {
- .name = "Crash kernel",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-struct resource crashk_low_res = {
- .name = "Crash kernel",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-int kexec_should_crash(struct task_struct *p)
-{
- /*
- * If crash_kexec_post_notifiers is enabled, don't run
- * crash_kexec() here yet, which must be run after panic
- * notifiers in panic().
- */
- if (crash_kexec_post_notifiers)
- return 0;
- /*
- * There are 4 panic() calls in do_exit() path, each of which
- * corresponds to each of these 4 conditions.
- */
- if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
- return 1;
- return 0;
-}
-
-/*
- * When kexec transitions to the new kernel there is a one-to-one
- * mapping between physical and virtual addresses. On processors
- * where you can disable the MMU this is trivial, and easy. For
- * others it is still a simple predictable page table to setup.
- *
- * In that environment kexec copies the new kernel to its final
- * resting place. This means I can only support memory whose
- * physical address can fit in an unsigned long. In particular
- * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
- * If the assembly stub has more restrictive requirements
- * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
- * defined more restrictively in <asm/kexec.h>.
- *
- * The code for the transition from the current kernel to the
- * the new kernel is placed in the control_code_buffer, whose size
- * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
- * page of memory is necessary, but some architectures require more.
- * Because this memory must be identity mapped in the transition from
- * virtual to physical addresses it must live in the range
- * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
- * modifiable.
- *
- * The assembly stub in the control code buffer is passed a linked list
- * of descriptor pages detailing the source pages of the new kernel,
- * and the destination addresses of those source pages. As this data
- * structure is not used in the context of the current OS, it must
- * be self-contained.
- *
- * The code has been made to work with highmem pages and will use a
- * destination page in its final resting place (if it happens
- * to allocate it). The end product of this is that most of the
- * physical address space, and most of RAM can be used.
- *
- * Future directions include:
- * - allocating a page table with the control code buffer identity
- * mapped, to simplify machine_kexec and make kexec_on_panic more
- * reliable.
- */
-
-/*
- * KIMAGE_NO_DEST is an impossible destination address..., for
- * allocating pages whose destination address we do not care about.
- */
-#define KIMAGE_NO_DEST (-1UL)
+#include <linux/slab.h>
-static int kimage_is_destination_range(struct kimage *image,
- unsigned long start, unsigned long end);
-static struct page *kimage_alloc_page(struct kimage *image,
- gfp_t gfp_mask,
- unsigned long dest);
+#include "kexec_internal.h"
static int copy_user_segment_list(struct kimage *image,
unsigned long nr_segments,
@@ -169,125 +35,6 @@ static int copy_user_segment_list(struct kimage *image,
return ret;
}
-static int sanity_check_segment_list(struct kimage *image)
-{
- int result, i;
- unsigned long nr_segments = image->nr_segments;
-
- /*
- * Verify we have good destination addresses. The caller is
- * responsible for making certain we don't attempt to load
- * the new image into invalid or reserved areas of RAM. This
- * just verifies it is an address we can use.
- *
- * Since the kernel does everything in page size chunks ensure
- * the destination addresses are page aligned. Too many
- * special cases crop of when we don't do this. The most
- * insidious is getting overlapping destination addresses
- * simply because addresses are changed to page size
- * granularity.
- */
- result = -EADDRNOTAVAIL;
- for (i = 0; i < nr_segments; i++) {
- unsigned long mstart, mend;
-
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz;
- if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
- return result;
- if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
- return result;
- }
-
- /* Verify our destination addresses do not overlap.
- * If we alloed overlapping destination addresses
- * through very weird things can happen with no
- * easy explanation as one segment stops on another.
- */
- result = -EINVAL;
- for (i = 0; i < nr_segments; i++) {
- unsigned long mstart, mend;
- unsigned long j;
-
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz;
- for (j = 0; j < i; j++) {
- unsigned long pstart, pend;
- pstart = image->segment[j].mem;
- pend = pstart + image->segment[j].memsz;
- /* Do the segments overlap ? */
- if ((mend > pstart) && (mstart < pend))
- return result;
- }
- }
-
- /* Ensure our buffer sizes are strictly less than
- * our memory sizes. This should always be the case,
- * and it is easier to check up front than to be surprised
- * later on.
- */
- result = -EINVAL;
- for (i = 0; i < nr_segments; i++) {
- if (image->segment[i].bufsz > image->segment[i].memsz)
- return result;
- }
-
- /*
- * Verify we have good destination addresses. Normally
- * the caller is responsible for making certain we don't
- * attempt to load the new image into invalid or reserved
- * areas of RAM. But crash kernels are preloaded into a
- * reserved area of ram. We must ensure the addresses
- * are in the reserved area otherwise preloading the
- * kernel could corrupt things.
- */
-
- if (image->type == KEXEC_TYPE_CRASH) {
- result = -EADDRNOTAVAIL;
- for (i = 0; i < nr_segments; i++) {
- unsigned long mstart, mend;
-
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz - 1;
- /* Ensure we are within the crash kernel limits */
- if ((mstart < crashk_res.start) ||
- (mend > crashk_res.end))
- return result;
- }
- }
-
- return 0;
-}
-
-static struct kimage *do_kimage_alloc_init(void)
-{
- struct kimage *image;
-
- /* Allocate a controlling structure */
- image = kzalloc(sizeof(*image), GFP_KERNEL);
- if (!image)
- return NULL;
-
- image->head = 0;
- image->entry = &image->head;
- image->last_entry = &image->head;
- image->control_page = ~0; /* By default this does not apply */
- image->type = KEXEC_TYPE_DEFAULT;
-
- /* Initialize the list of control pages */
- INIT_LIST_HEAD(&image->control_pages);
-
- /* Initialize the list of destination pages */
- INIT_LIST_HEAD(&image->dest_pages);
-
- /* Initialize the list of unusable pages */
- INIT_LIST_HEAD(&image->unusable_pages);
-
- return image;
-}
-
-static void kimage_free_page_list(struct list_head *list);
-
static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
unsigned long nr_segments,
struct kexec_segment __user *segments,
@@ -354,873 +101,6 @@ out_free_image:
return ret;
}
-#ifdef CONFIG_KEXEC_FILE
-static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
-{
- struct fd f = fdget(fd);
- int ret;
- struct kstat stat;
- loff_t pos;
- ssize_t bytes = 0;
-
- if (!f.file)
- return -EBADF;
-
- ret = vfs_getattr(&f.file->f_path, &stat);
- if (ret)
- goto out;
-
- if (stat.size > INT_MAX) {
- ret = -EFBIG;
- goto out;
- }
-
- /* Don't hand 0 to vmalloc, it whines. */
- if (stat.size == 0) {
- ret = -EINVAL;
- goto out;
- }
-
- *buf = vmalloc(stat.size);
- if (!*buf) {
- ret = -ENOMEM;
- goto out;
- }
-
- pos = 0;
- while (pos < stat.size) {
- bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
- stat.size - pos);
- if (bytes < 0) {
- vfree(*buf);
- ret = bytes;
- goto out;
- }
-
- if (bytes == 0)
- break;
- pos += bytes;
- }
-
- if (pos != stat.size) {
- ret = -EBADF;
- vfree(*buf);
- goto out;
- }
-
- *buf_len = pos;
-out:
- fdput(f);
- return ret;
-}
-
-/* Architectures can provide this probe function */
-int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
- unsigned long buf_len)
-{
- return -ENOEXEC;
-}
-
-void * __weak arch_kexec_kernel_image_load(struct kimage *image)
-{
- return ERR_PTR(-ENOEXEC);
-}
-
-void __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
-{
-}
-
-int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
- unsigned long buf_len)
-{
- return -EKEYREJECTED;
-}
-
-/* Apply relocations of type RELA */
-int __weak
-arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
- unsigned int relsec)
-{
- pr_err("RELA relocation unsupported.\n");
- return -ENOEXEC;
-}
-
-/* Apply relocations of type REL */
-int __weak
-arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
- unsigned int relsec)
-{
- pr_err("REL relocation unsupported.\n");
- return -ENOEXEC;
-}
-
-/*
- * Free up memory used by kernel, initrd, and command line. This is temporary
- * memory allocation which is not needed any more after these buffers have
- * been loaded into separate segments and have been copied elsewhere.
- */
-static void kimage_file_post_load_cleanup(struct kimage *image)
-{
- struct purgatory_info *pi = &image->purgatory_info;
-
- vfree(image->kernel_buf);
- image->kernel_buf = NULL;
-
- vfree(image->initrd_buf);
- image->initrd_buf = NULL;
-
- kfree(image->cmdline_buf);
- image->cmdline_buf = NULL;
-
- vfree(pi->purgatory_buf);
- pi->purgatory_buf = NULL;
-
- vfree(pi->sechdrs);
- pi->sechdrs = NULL;
-
- /* See if architecture has anything to cleanup post load */
- arch_kimage_file_post_load_cleanup(image);
-
- /*
- * Above call should have called into bootloader to free up
- * any data stored in kimage->image_loader_data. It should
- * be ok now to free it up.
- */
- kfree(image->image_loader_data);
- image->image_loader_data = NULL;
-}
-
-/*
- * In file mode list of segments is prepared by kernel. Copy relevant
- * data from user space, do error checking, prepare segment list
- */
-static int
-kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
- const char __user *cmdline_ptr,
- unsigned long cmdline_len, unsigned flags)
-{
- int ret = 0;
- void *ldata;
-
- ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
- &image->kernel_buf_len);
- if (ret)
- return ret;
-
- /* Call arch image probe handlers */
- ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
- image->kernel_buf_len);
-
- if (ret)
- goto out;
-
-#ifdef CONFIG_KEXEC_VERIFY_SIG
- ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
- image->kernel_buf_len);
- if (ret) {
- pr_debug("kernel signature verification failed.\n");
- goto out;
- }
- pr_debug("kernel signature verification successful.\n");
-#endif
- /* It is possible that there no initramfs is being loaded */
- if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
- ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
- &image->initrd_buf_len);
- if (ret)
- goto out;
- }
-
- if (cmdline_len) {
- image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
- if (!image->cmdline_buf) {
- ret = -ENOMEM;
- goto out;
- }
-
- ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
- cmdline_len);
- if (ret) {
- ret = -EFAULT;
- goto out;
- }
-
- image->cmdline_buf_len = cmdline_len;
-
- /* command line should be a string with last byte null */
- if (image->cmdline_buf[cmdline_len - 1] != '\0') {
- ret = -EINVAL;
- goto out;
- }
- }
-
- /* Call arch image load handlers */
- ldata = arch_kexec_kernel_image_load(image);
-
- if (IS_ERR(ldata)) {
- ret = PTR_ERR(ldata);
- goto out;
- }
-
- image->image_loader_data = ldata;
-out:
- /* In case of error, free up all allocated memory in this function */
- if (ret)
- kimage_file_post_load_cleanup(image);
- return ret;
-}
-
-static int
-kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
- int initrd_fd, const char __user *cmdline_ptr,
- unsigned long cmdline_len, unsigned long flags)
-{
- int ret;
- struct kimage *image;
- bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
-
- image = do_kimage_alloc_init();
- if (!image)
- return -ENOMEM;
-
- image->file_mode = 1;
-
- if (kexec_on_panic) {
- /* Enable special crash kernel control page alloc policy. */
- image->control_page = crashk_res.start;
- image->type = KEXEC_TYPE_CRASH;
- }
-
- ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
- cmdline_ptr, cmdline_len, flags);
- if (ret)
- goto out_free_image;
-
- ret = sanity_check_segment_list(image);
- if (ret)
- goto out_free_post_load_bufs;
-
- ret = -ENOMEM;
- image->control_code_page = kimage_alloc_control_pages(image,
- get_order(KEXEC_CONTROL_PAGE_SIZE));
- if (!image->control_code_page) {
- pr_err("Could not allocate control_code_buffer\n");
- goto out_free_post_load_bufs;
- }
-
- if (!kexec_on_panic) {
- image->swap_page = kimage_alloc_control_pages(image, 0);
- if (!image->swap_page) {
- pr_err("Could not allocate swap buffer\n");
- goto out_free_control_pages;
- }
- }
-
- *rimage = image;
- return 0;
-out_free_control_pages:
- kimage_free_page_list(&image->control_pages);
-out_free_post_load_bufs:
- kimage_file_post_load_cleanup(image);
-out_free_image:
- kfree(image);
- return ret;
-}
-#else /* CONFIG_KEXEC_FILE */
-static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
-#endif /* CONFIG_KEXEC_FILE */
-
-static int kimage_is_destination_range(struct kimage *image,
- unsigned long start,
- unsigned long end)
-{
- unsigned long i;
-
- for (i = 0; i < image->nr_segments; i++) {
- unsigned long mstart, mend;
-
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz;
- if ((end > mstart) && (start < mend))
- return 1;
- }
-
- return 0;
-}
-
-static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
-{
- struct page *pages;
-
- pages = alloc_pages(gfp_mask, order);
- if (pages) {
- unsigned int count, i;
- pages->mapping = NULL;
- set_page_private(pages, order);
- count = 1 << order;
- for (i = 0; i < count; i++)
- SetPageReserved(pages + i);
- }
-
- return pages;
-}
-
-static void kimage_free_pages(struct page *page)
-{
- unsigned int order, count, i;
-
- order = page_private(page);
- count = 1 << order;
- for (i = 0; i < count; i++)
- ClearPageReserved(page + i);
- __free_pages(page, order);
-}
-
-static void kimage_free_page_list(struct list_head *list)
-{
- struct list_head *pos, *next;
-
- list_for_each_safe(pos, next, list) {
- struct page *page;
-
- page = list_entry(pos, struct page, lru);
- list_del(&page->lru);
- kimage_free_pages(page);
- }
-}
-
-static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
- unsigned int order)
-{
- /* Control pages are special, they are the intermediaries
- * that are needed while we copy the rest of the pages
- * to their final resting place. As such they must
- * not conflict with either the destination addresses
- * or memory the kernel is already using.
- *
- * The only case where we really need more than one of
- * these are for architectures where we cannot disable
- * the MMU and must instead generate an identity mapped
- * page table for all of the memory.
- *
- * At worst this runs in O(N) of the image size.
- */
- struct list_head extra_pages;
- struct page *pages;
- unsigned int count;
-
- count = 1 << order;
- INIT_LIST_HEAD(&extra_pages);
-
- /* Loop while I can allocate a page and the page allocated
- * is a destination page.
- */
- do {
- unsigned long pfn, epfn, addr, eaddr;
-
- pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
- if (!pages)
- break;
- pfn = page_to_pfn(pages);
- epfn = pfn + count;
- addr = pfn << PAGE_SHIFT;
- eaddr = epfn << PAGE_SHIFT;
- if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
- kimage_is_destination_range(image, addr, eaddr)) {
- list_add(&pages->lru, &extra_pages);
- pages = NULL;
- }
- } while (!pages);
-
- if (pages) {
- /* Remember the allocated page... */
- list_add(&pages->lru, &image->control_pages);
-
- /* Because the page is already in it's destination
- * location we will never allocate another page at
- * that address. Therefore kimage_alloc_pages
- * will not return it (again) and we don't need
- * to give it an entry in image->segment[].
- */
- }
- /* Deal with the destination pages I have inadvertently allocated.
- *
- * Ideally I would convert multi-page allocations into single
- * page allocations, and add everything to image->dest_pages.
- *
- * For now it is simpler to just free the pages.
- */
- kimage_free_page_list(&extra_pages);
-
- return pages;
-}
-
-static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
- unsigned int order)
-{
- /* Control pages are special, they are the intermediaries
- * that are needed while we copy the rest of the pages
- * to their final resting place. As such they must
- * not conflict with either the destination addresses
- * or memory the kernel is already using.
- *
- * Control pages are also the only pags we must allocate
- * when loading a crash kernel. All of the other pages
- * are specified by the segments and we just memcpy
- * into them directly.
- *
- * The only case where we really need more than one of
- * these are for architectures where we cannot disable
- * the MMU and must instead generate an identity mapped
- * page table for all of the memory.
- *
- * Given the low demand this implements a very simple
- * allocator that finds the first hole of the appropriate
- * size in the reserved memory region, and allocates all
- * of the memory up to and including the hole.
- */
- unsigned long hole_start, hole_end, size;
- struct page *pages;
-
- pages = NULL;
- size = (1 << order) << PAGE_SHIFT;
- hole_start = (image->control_page + (size - 1)) & ~(size - 1);
- hole_end = hole_start + size - 1;
- while (hole_end <= crashk_res.end) {
- unsigned long i;
-
- if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
- break;
- /* See if I overlap any of the segments */
- for (i = 0; i < image->nr_segments; i++) {
- unsigned long mstart, mend;
-
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz - 1;
- if ((hole_end >= mstart) && (hole_start <= mend)) {
- /* Advance the hole to the end of the segment */
- hole_start = (mend + (size - 1)) & ~(size - 1);
- hole_end = hole_start + size - 1;
- break;
- }
- }
- /* If I don't overlap any segments I have found my hole! */
- if (i == image->nr_segments) {
- pages = pfn_to_page(hole_start >> PAGE_SHIFT);
- break;
- }
- }
- if (pages)
- image->control_page = hole_end;
-
- return pages;
-}
-
-
-struct page *kimage_alloc_control_pages(struct kimage *image,
- unsigned int order)
-{
- struct page *pages = NULL;
-
- switch (image->type) {
- case KEXEC_TYPE_DEFAULT:
- pages = kimage_alloc_normal_control_pages(image, order);
- break;
- case KEXEC_TYPE_CRASH:
- pages = kimage_alloc_crash_control_pages(image, order);
- break;
- }
-
- return pages;
-}
-
-static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
-{
- if (*image->entry != 0)
- image->entry++;
-
- if (image->entry == image->last_entry) {
- kimage_entry_t *ind_page;
- struct page *page;
-
- page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
- if (!page)
- return -ENOMEM;
-
- ind_page = page_address(page);
- *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
- image->entry = ind_page;
- image->last_entry = ind_page +
- ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
- }
- *image->entry = entry;
- image->entry++;
- *image->entry = 0;
-
- return 0;
-}
-
-static int kimage_set_destination(struct kimage *image,
- unsigned long destination)
-{
- int result;
-
- destination &= PAGE_MASK;
- result = kimage_add_entry(image, destination | IND_DESTINATION);
-
- return result;
-}
-
-
-static int kimage_add_page(struct kimage *image, unsigned long page)
-{
- int result;
-
- page &= PAGE_MASK;
- result = kimage_add_entry(image, page | IND_SOURCE);
-
- return result;
-}
-
-
-static void kimage_free_extra_pages(struct kimage *image)
-{
- /* Walk through and free any extra destination pages I may have */
- kimage_free_page_list(&image->dest_pages);
-
- /* Walk through and free any unusable pages I have cached */
- kimage_free_page_list(&image->unusable_pages);
-
-}
-static void kimage_terminate(struct kimage *image)
-{
- if (*image->entry != 0)
- image->entry++;
-
- *image->entry = IND_DONE;
-}
-
-#define for_each_kimage_entry(image, ptr, entry) \
- for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
- ptr = (entry & IND_INDIRECTION) ? \
- phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
-
-static void kimage_free_entry(kimage_entry_t entry)
-{
- struct page *page;
-
- page = pfn_to_page(entry >> PAGE_SHIFT);
- kimage_free_pages(page);
-}
-
-static void kimage_free(struct kimage *image)
-{
- kimage_entry_t *ptr, entry;
- kimage_entry_t ind = 0;
-
- if (!image)
- return;
-
- kimage_free_extra_pages(image);
- for_each_kimage_entry(image, ptr, entry) {
- if (entry & IND_INDIRECTION) {
- /* Free the previous indirection page */
- if (ind & IND_INDIRECTION)
- kimage_free_entry(ind);
- /* Save this indirection page until we are
- * done with it.
- */
- ind = entry;
- } else if (entry & IND_SOURCE)
- kimage_free_entry(entry);
- }
- /* Free the final indirection page */
- if (ind & IND_INDIRECTION)
- kimage_free_entry(ind);
-
- /* Handle any machine specific cleanup */
- machine_kexec_cleanup(image);
-
- /* Free the kexec control pages... */
- kimage_free_page_list(&image->control_pages);
-
- /*
- * Free up any temporary buffers allocated. This might hit if
- * error occurred much later after buffer allocation.
- */
- if (image->file_mode)
- kimage_file_post_load_cleanup(image);
-
- kfree(image);
-}
-
-static kimage_entry_t *kimage_dst_used(struct kimage *image,
- unsigned long page)
-{
- kimage_entry_t *ptr, entry;
- unsigned long destination = 0;
-
- for_each_kimage_entry(image, ptr, entry) {
- if (entry & IND_DESTINATION)
- destination = entry & PAGE_MASK;
- else if (entry & IND_SOURCE) {
- if (page == destination)
- return ptr;
- destination += PAGE_SIZE;
- }
- }
-
- return NULL;
-}
-
-static struct page *kimage_alloc_page(struct kimage *image,
- gfp_t gfp_mask,
- unsigned long destination)
-{
- /*
- * Here we implement safeguards to ensure that a source page
- * is not copied to its destination page before the data on
- * the destination page is no longer useful.
- *
- * To do this we maintain the invariant that a source page is
- * either its own destination page, or it is not a
- * destination page at all.
- *
- * That is slightly stronger than required, but the proof
- * that no problems will not occur is trivial, and the
- * implementation is simply to verify.
- *
- * When allocating all pages normally this algorithm will run
- * in O(N) time, but in the worst case it will run in O(N^2)
- * time. If the runtime is a problem the data structures can
- * be fixed.
- */
- struct page *page;
- unsigned long addr;
-
- /*
- * Walk through the list of destination pages, and see if I
- * have a match.
- */
- list_for_each_entry(page, &image->dest_pages, lru) {
- addr = page_to_pfn(page) << PAGE_SHIFT;
- if (addr == destination) {
- list_del(&page->lru);
- return page;
- }
- }
- page = NULL;
- while (1) {
- kimage_entry_t *old;
-
- /* Allocate a page, if we run out of memory give up */
- page = kimage_alloc_pages(gfp_mask, 0);
- if (!page)
- return NULL;
- /* If the page cannot be used file it away */
- if (page_to_pfn(page) >
- (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
- list_add(&page->lru, &image->unusable_pages);
- continue;
- }
- addr = page_to_pfn(page) << PAGE_SHIFT;
-
- /* If it is the destination page we want use it */
- if (addr == destination)
- break;
-
- /* If the page is not a destination page use it */
- if (!kimage_is_destination_range(image, addr,
- addr + PAGE_SIZE))
- break;
-
- /*
- * I know that the page is someones destination page.
- * See if there is already a source page for this
- * destination page. And if so swap the source pages.
- */
- old = kimage_dst_used(image, addr);
- if (old) {
- /* If so move it */
- unsigned long old_addr;
- struct page *old_page;
-
- old_addr = *old & PAGE_MASK;
- old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
- copy_highpage(page, old_page);
- *old = addr | (*old & ~PAGE_MASK);
-
- /* The old page I have found cannot be a
- * destination page, so return it if it's
- * gfp_flags honor the ones passed in.
- */
- if (!(gfp_mask & __GFP_HIGHMEM) &&
- PageHighMem(old_page)) {
- kimage_free_pages(old_page);
- continue;
- }
- addr = old_addr;
- page = old_page;
- break;
- } else {
- /* Place the page on the destination list I
- * will use it later.
- */
- list_add(&page->lru, &image->dest_pages);
- }
- }
-
- return page;
-}
-
-static int kimage_load_normal_segment(struct kimage *image,
- struct kexec_segment *segment)
-{
- unsigned long maddr;
- size_t ubytes, mbytes;
- int result;
- unsigned char __user *buf = NULL;
- unsigned char *kbuf = NULL;
-
- result = 0;
- if (image->file_mode)
- kbuf = segment->kbuf;
- else
- buf = segment->buf;
- ubytes = segment->bufsz;
- mbytes = segment->memsz;
- maddr = segment->mem;
-
- result = kimage_set_destination(image, maddr);
- if (result < 0)
- goto out;
-
- while (mbytes) {
- struct page *page;
- char *ptr;
- size_t uchunk, mchunk;
-
- page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
- if (!page) {
- result = -ENOMEM;
- goto out;
- }
- result = kimage_add_page(image, page_to_pfn(page)
- << PAGE_SHIFT);
- if (result < 0)
- goto out;
-
- ptr = kmap(page);
- /* Start with a clear page */
- clear_page(ptr);
- ptr += maddr & ~PAGE_MASK;
- mchunk = min_t(size_t, mbytes,
- PAGE_SIZE - (maddr & ~PAGE_MASK));
- uchunk = min(ubytes, mchunk);
-
- /* For file based kexec, source pages are in kernel memory */
- if (image->file_mode)
- memcpy(ptr, kbuf, uchunk);
- else
- result = copy_from_user(ptr, buf, uchunk);
- kunmap(page);
- if (result) {
- result = -EFAULT;
- goto out;
- }
- ubytes -= uchunk;
- maddr += mchunk;
- if (image->file_mode)
- kbuf += mchunk;
- else
- buf += mchunk;
- mbytes -= mchunk;
- }
-out:
- return result;
-}
-
-static int kimage_load_crash_segment(struct kimage *image,
- struct kexec_segment *segment)
-{
- /* For crash dumps kernels we simply copy the data from
- * user space to it's destination.
- * We do things a page at a time for the sake of kmap.
- */
- unsigned long maddr;
- size_t ubytes, mbytes;
- int result;
- unsigned char __user *buf = NULL;
- unsigned char *kbuf = NULL;
-
- result = 0;
- if (image->file_mode)
- kbuf = segment->kbuf;
- else
- buf = segment->buf;
- ubytes = segment->bufsz;
- mbytes = segment->memsz;
- maddr = segment->mem;
- while (mbytes) {
- struct page *page;
- char *ptr;
- size_t uchunk, mchunk;
-
- page = pfn_to_page(maddr >> PAGE_SHIFT);
- if (!page) {
- result = -ENOMEM;
- goto out;
- }
- ptr = kmap(page);
- ptr += maddr & ~PAGE_MASK;
- mchunk = min_t(size_t, mbytes,
- PAGE_SIZE - (maddr & ~PAGE_MASK));
- uchunk = min(ubytes, mchunk);
- if (mchunk > uchunk) {
- /* Zero the trailing part of the page */
- memset(ptr + uchunk, 0, mchunk - uchunk);
- }
-
- /* For file based kexec, source pages are in kernel memory */
- if (image->file_mode)
- memcpy(ptr, kbuf, uchunk);
- else
- result = copy_from_user(ptr, buf, uchunk);
- kexec_flush_icache_page(page);
- kunmap(page);
- if (result) {
- result = -EFAULT;
- goto out;
- }
- ubytes -= uchunk;
- maddr += mchunk;
- if (image->file_mode)
- kbuf += mchunk;
- else
- buf += mchunk;
- mbytes -= mchunk;
- }
-out:
- return result;
-}
-
-static int kimage_load_segment(struct kimage *image,
- struct kexec_segment *segment)
-{
- int result = -ENOMEM;
-
- switch (image->type) {
- case KEXEC_TYPE_DEFAULT:
- result = kimage_load_normal_segment(image, segment);
- break;
- case KEXEC_TYPE_CRASH:
- result = kimage_load_crash_segment(image, segment);
- break;
- }
-
- return result;
-}
-
/*
* Exec Kernel system call: for obvious reasons only root may call it.
*
@@ -1241,11 +121,6 @@ static int kimage_load_segment(struct kimage *image,
* kexec does not sync, or unmount filesystems so if you need
* that to happen you need to do that yourself.
*/
-struct kimage *kexec_image;
-struct kimage *kexec_crash_image;
-int kexec_load_disabled;
-
-static DEFINE_MUTEX(kexec_mutex);
SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
struct kexec_segment __user *, segments, unsigned long, flags)
@@ -1340,18 +215,6 @@ out:
return result;
}
-/*
- * Add and remove page tables for crashkernel memory
- *
- * Provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak crash_map_reserved_pages(void)
-{}
-
-void __weak crash_unmap_reserved_pages(void)
-{}
-
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
compat_ulong_t, nr_segments,
@@ -1390,1391 +253,3 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
return sys_kexec_load(entry, nr_segments, ksegments, flags);
}
#endif
-
-#ifdef CONFIG_KEXEC_FILE
-SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
- unsigned long, cmdline_len, const char __user *, cmdline_ptr,
- unsigned long, flags)
-{
- int ret = 0, i;
- struct kimage **dest_image, *image;
-
- /* We only trust the superuser with rebooting the system. */
- if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
- return -EPERM;
-
- /* Make sure we have a legal set of flags */
- if (flags != (flags & KEXEC_FILE_FLAGS))
- return -EINVAL;
-
- image = NULL;
-
- if (!mutex_trylock(&kexec_mutex))
- return -EBUSY;
-
- dest_image = &kexec_image;
- if (flags & KEXEC_FILE_ON_CRASH)
- dest_image = &kexec_crash_image;
-
- if (flags & KEXEC_FILE_UNLOAD)
- goto exchange;
-
- /*
- * In case of crash, new kernel gets loaded in reserved region. It is
- * same memory where old crash kernel might be loaded. Free any
- * current crash dump kernel before we corrupt it.
- */
- if (flags & KEXEC_FILE_ON_CRASH)
- kimage_free(xchg(&kexec_crash_image, NULL));
-
- ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
- cmdline_len, flags);
- if (ret)
- goto out;
-
- ret = machine_kexec_prepare(image);
- if (ret)
- goto out;
-
- ret = kexec_calculate_store_digests(image);
- if (ret)
- goto out;
-
- for (i = 0; i < image->nr_segments; i++) {
- struct kexec_segment *ksegment;
-
- ksegment = &image->segment[i];
- pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
- i, ksegment->buf, ksegment->bufsz, ksegment->mem,
- ksegment->memsz);
-
- ret = kimage_load_segment(image, &image->segment[i]);
- if (ret)
- goto out;
- }
-
- kimage_terminate(image);
-
- /*
- * Free up any temporary buffers allocated which are not needed
- * after image has been loaded
- */
- kimage_file_post_load_cleanup(image);
-exchange:
- image = xchg(dest_image, image);
-out:
- mutex_unlock(&kexec_mutex);
- kimage_free(image);
- return ret;
-}
-
-#endif /* CONFIG_KEXEC_FILE */
-
-void crash_kexec(struct pt_regs *regs)
-{
- /* Take the kexec_mutex here to prevent sys_kexec_load
- * running on one cpu from replacing the crash kernel
- * we are using after a panic on a different cpu.
- *
- * If the crash kernel was not located in a fixed area
- * of memory the xchg(&kexec_crash_image) would be
- * sufficient. But since I reuse the memory...
- */
- if (mutex_trylock(&kexec_mutex)) {
- if (kexec_crash_image) {
- struct pt_regs fixed_regs;
-
- crash_setup_regs(&fixed_regs, regs);
- crash_save_vmcoreinfo();
- machine_crash_shutdown(&fixed_regs);
- machine_kexec(kexec_crash_image);
- }
- mutex_unlock(&kexec_mutex);
- }
-}
-
-size_t crash_get_memory_size(void)
-{
- size_t size = 0;
- mutex_lock(&kexec_mutex);
- if (crashk_res.end != crashk_res.start)
- size = resource_size(&crashk_res);
- mutex_unlock(&kexec_mutex);
- return size;
-}
-
-void __weak crash_free_reserved_phys_range(unsigned long begin,
- unsigned long end)
-{
- unsigned long addr;
-
- for (addr = begin; addr < end; addr += PAGE_SIZE)
- free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
-}
-
-int crash_shrink_memory(unsigned long new_size)
-{
- int ret = 0;
- unsigned long start, end;
- unsigned long old_size;
- struct resource *ram_res;
-
- mutex_lock(&kexec_mutex);
-
- if (kexec_crash_image) {
- ret = -ENOENT;
- goto unlock;
- }
- start = crashk_res.start;
- end = crashk_res.end;
- old_size = (end == 0) ? 0 : end - start + 1;
- if (new_size >= old_size) {
- ret = (new_size == old_size) ? 0 : -EINVAL;
- goto unlock;
- }
-
- ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
- if (!ram_res) {
- ret = -ENOMEM;
- goto unlock;
- }
-
- start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
- end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
-
- crash_map_reserved_pages();
- crash_free_reserved_phys_range(end, crashk_res.end);
-
- if ((start == end) && (crashk_res.parent != NULL))
- release_resource(&crashk_res);
-
- ram_res->start = end;
- ram_res->end = crashk_res.end;
- ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
- ram_res->name = "System RAM";
-
- crashk_res.end = end - 1;
-
- insert_resource(&iomem_resource, ram_res);
- crash_unmap_reserved_pages();
-
-unlock:
- mutex_unlock(&kexec_mutex);
- return ret;
-}
-
-static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
- size_t data_len)
-{
- struct elf_note note;
-
- note.n_namesz = strlen(name) + 1;
- note.n_descsz = data_len;
- note.n_type = type;
- memcpy(buf, &note, sizeof(note));
- buf += (sizeof(note) + 3)/4;
- memcpy(buf, name, note.n_namesz);
- buf += (note.n_namesz + 3)/4;
- memcpy(buf, data, note.n_descsz);
- buf += (note.n_descsz + 3)/4;
-
- return buf;
-}
-
-static void final_note(u32 *buf)
-{
- struct elf_note note;
-
- note.n_namesz = 0;
- note.n_descsz = 0;
- note.n_type = 0;
- memcpy(buf, &note, sizeof(note));
-}
-
-void crash_save_cpu(struct pt_regs *regs, int cpu)
-{
- struct elf_prstatus prstatus;
- u32 *buf;
-
- if ((cpu < 0) || (cpu >= nr_cpu_ids))
- return;
-
- /* Using ELF notes here is opportunistic.
- * I need a well defined structure format
- * for the data I pass, and I need tags
- * on the data to indicate what information I have
- * squirrelled away. ELF notes happen to provide
- * all of that, so there is no need to invent something new.
- */
- buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
- if (!buf)
- return;
- memset(&prstatus, 0, sizeof(prstatus));
- prstatus.pr_pid = current->pid;
- elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
- buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
- &prstatus, sizeof(prstatus));
- final_note(buf);
-}
-
-static int __init crash_notes_memory_init(void)
-{
- /* Allocate memory for saving cpu registers. */
- crash_notes = alloc_percpu(note_buf_t);
- if (!crash_notes) {
- pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
- return -ENOMEM;
- }
- return 0;
-}
-subsys_initcall(crash_notes_memory_init);
-
-
-/*
- * parsing the "crashkernel" commandline
- *
- * this code is intended to be called from architecture specific code
- */
-
-
-/*
- * This function parses command lines in the format
- *
- * crashkernel=ramsize-range:size[,...][@offset]
- *
- * The function returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_mem(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- char *cur = cmdline, *tmp;
-
- /* for each entry of the comma-separated list */
- do {
- unsigned long long start, end = ULLONG_MAX, size;
-
- /* get the start of the range */
- start = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("crashkernel: Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (*cur != '-') {
- pr_warn("crashkernel: '-' expected\n");
- return -EINVAL;
- }
- cur++;
-
- /* if no ':' is here, then we read the end */
- if (*cur != ':') {
- end = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("crashkernel: Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (end <= start) {
- pr_warn("crashkernel: end <= start\n");
- return -EINVAL;
- }
- }
-
- if (*cur != ':') {
- pr_warn("crashkernel: ':' expected\n");
- return -EINVAL;
- }
- cur++;
-
- size = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (size >= system_ram) {
- pr_warn("crashkernel: invalid size\n");
- return -EINVAL;
- }
-
- /* match ? */
- if (system_ram >= start && system_ram < end) {
- *crash_size = size;
- break;
- }
- } while (*cur++ == ',');
-
- if (*crash_size > 0) {
- while (*cur && *cur != ' ' && *cur != '@')
- cur++;
- if (*cur == '@') {
- cur++;
- *crash_base = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("Memory value expected after '@'\n");
- return -EINVAL;
- }
- }
- }
-
- return 0;
-}
-
-/*
- * That function parses "simple" (old) crashkernel command lines like
- *
- * crashkernel=size[@offset]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_simple(char *cmdline,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- char *cur = cmdline;
-
- *crash_size = memparse(cmdline, &cur);
- if (cmdline == cur) {
- pr_warn("crashkernel: memory value expected\n");
- return -EINVAL;
- }
-
- if (*cur == '@')
- *crash_base = memparse(cur+1, &cur);
- else if (*cur != ' ' && *cur != '\0') {
- pr_warn("crashkernel: unrecognized char\n");
- return -EINVAL;
- }
-
- return 0;
-}
-
-#define SUFFIX_HIGH 0
-#define SUFFIX_LOW 1
-#define SUFFIX_NULL 2
-static __initdata char *suffix_tbl[] = {
- [SUFFIX_HIGH] = ",high",
- [SUFFIX_LOW] = ",low",
- [SUFFIX_NULL] = NULL,
-};
-
-/*
- * That function parses "suffix" crashkernel command lines like
- *
- * crashkernel=size,[high|low]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_suffix(char *cmdline,
- unsigned long long *crash_size,
- const char *suffix)
-{
- char *cur = cmdline;
-
- *crash_size = memparse(cmdline, &cur);
- if (cmdline == cur) {
- pr_warn("crashkernel: memory value expected\n");
- return -EINVAL;
- }
-
- /* check with suffix */
- if (strncmp(cur, suffix, strlen(suffix))) {
- pr_warn("crashkernel: unrecognized char\n");
- return -EINVAL;
- }
- cur += strlen(suffix);
- if (*cur != ' ' && *cur != '\0') {
- pr_warn("crashkernel: unrecognized char\n");
- return -EINVAL;
- }
-
- return 0;
-}
-
-static __init char *get_last_crashkernel(char *cmdline,
- const char *name,
- const char *suffix)
-{
- char *p = cmdline, *ck_cmdline = NULL;
-
- /* find crashkernel and use the last one if there are more */
- p = strstr(p, name);
- while (p) {
- char *end_p = strchr(p, ' ');
- char *q;
-
- if (!end_p)
- end_p = p + strlen(p);
-
- if (!suffix) {
- int i;
-
- /* skip the one with any known suffix */
- for (i = 0; suffix_tbl[i]; i++) {
- q = end_p - strlen(suffix_tbl[i]);
- if (!strncmp(q, suffix_tbl[i],
- strlen(suffix_tbl[i])))
- goto next;
- }
- ck_cmdline = p;
- } else {
- q = end_p - strlen(suffix);
- if (!strncmp(q, suffix, strlen(suffix)))
- ck_cmdline = p;
- }
-next:
- p = strstr(p+1, name);
- }
-
- if (!ck_cmdline)
- return NULL;
-
- return ck_cmdline;
-}
-
-static int __init __parse_crashkernel(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base,
- const char *name,
- const char *suffix)
-{
- char *first_colon, *first_space;
- char *ck_cmdline;
-
- BUG_ON(!crash_size || !crash_base);
- *crash_size = 0;
- *crash_base = 0;
-
- ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
-
- if (!ck_cmdline)
- return -EINVAL;
-
- ck_cmdline += strlen(name);
-
- if (suffix)
- return parse_crashkernel_suffix(ck_cmdline, crash_size,
- suffix);
- /*
- * if the commandline contains a ':', then that's the extended
- * syntax -- if not, it must be the classic syntax
- */
- first_colon = strchr(ck_cmdline, ':');
- first_space = strchr(ck_cmdline, ' ');
- if (first_colon && (!first_space || first_colon < first_space))
- return parse_crashkernel_mem(ck_cmdline, system_ram,
- crash_size, crash_base);
-
- return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
-}
-
-/*
- * This function is the entry point for command line parsing and should be
- * called from the arch-specific code.
- */
-int __init parse_crashkernel(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", NULL);
-}
-
-int __init parse_crashkernel_high(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
-}
-
-int __init parse_crashkernel_low(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", suffix_tbl[SUFFIX_LOW]);
-}
-
-static void update_vmcoreinfo_note(void)
-{
- u32 *buf = vmcoreinfo_note;
-
- if (!vmcoreinfo_size)
- return;
- buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
- vmcoreinfo_size);
- final_note(buf);
-}
-
-void crash_save_vmcoreinfo(void)
-{
- vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
- update_vmcoreinfo_note();
-}
-
-void vmcoreinfo_append_str(const char *fmt, ...)
-{
- va_list args;
- char buf[0x50];
- size_t r;
-
- va_start(args, fmt);
- r = vscnprintf(buf, sizeof(buf), fmt, args);
- va_end(args);
-
- r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
-
- memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
-
- vmcoreinfo_size += r;
-}
-
-/*
- * provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak arch_crash_save_vmcoreinfo(void)
-{}
-
-unsigned long __weak paddr_vmcoreinfo_note(void)
-{
- return __pa((unsigned long)(char *)&vmcoreinfo_note);
-}
-
-static int __init crash_save_vmcoreinfo_init(void)
-{
- VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
- VMCOREINFO_PAGESIZE(PAGE_SIZE);
-
- VMCOREINFO_SYMBOL(init_uts_ns);
- VMCOREINFO_SYMBOL(node_online_map);
-#ifdef CONFIG_MMU
- VMCOREINFO_SYMBOL(swapper_pg_dir);
-#endif
- VMCOREINFO_SYMBOL(_stext);
- VMCOREINFO_SYMBOL(vmap_area_list);
-
-#ifndef CONFIG_NEED_MULTIPLE_NODES
- VMCOREINFO_SYMBOL(mem_map);
- VMCOREINFO_SYMBOL(contig_page_data);
-#endif
-#ifdef CONFIG_SPARSEMEM
- VMCOREINFO_SYMBOL(mem_section);
- VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
- VMCOREINFO_STRUCT_SIZE(mem_section);
- VMCOREINFO_OFFSET(mem_section, section_mem_map);
-#endif
- VMCOREINFO_STRUCT_SIZE(page);
- VMCOREINFO_STRUCT_SIZE(pglist_data);
- VMCOREINFO_STRUCT_SIZE(zone);
- VMCOREINFO_STRUCT_SIZE(free_area);
- VMCOREINFO_STRUCT_SIZE(list_head);
- VMCOREINFO_SIZE(nodemask_t);
- VMCOREINFO_OFFSET(page, flags);
- VMCOREINFO_OFFSET(page, _count);
- VMCOREINFO_OFFSET(page, mapping);
- VMCOREINFO_OFFSET(page, lru);
- VMCOREINFO_OFFSET(page, _mapcount);
- VMCOREINFO_OFFSET(page, private);
- VMCOREINFO_OFFSET(pglist_data, node_zones);
- VMCOREINFO_OFFSET(pglist_data, nr_zones);
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
- VMCOREINFO_OFFSET(pglist_data, node_mem_map);
-#endif
- VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
- VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
- VMCOREINFO_OFFSET(pglist_data, node_id);
- VMCOREINFO_OFFSET(zone, free_area);
- VMCOREINFO_OFFSET(zone, vm_stat);
- VMCOREINFO_OFFSET(zone, spanned_pages);
- VMCOREINFO_OFFSET(free_area, free_list);
- VMCOREINFO_OFFSET(list_head, next);
- VMCOREINFO_OFFSET(list_head, prev);
- VMCOREINFO_OFFSET(vmap_area, va_start);
- VMCOREINFO_OFFSET(vmap_area, list);
- VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
- log_buf_kexec_setup();
- VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
- VMCOREINFO_NUMBER(NR_FREE_PAGES);
- VMCOREINFO_NUMBER(PG_lru);
- VMCOREINFO_NUMBER(PG_private);
- VMCOREINFO_NUMBER(PG_swapcache);
- VMCOREINFO_NUMBER(PG_slab);
-#ifdef CONFIG_MEMORY_FAILURE
- VMCOREINFO_NUMBER(PG_hwpoison);
-#endif
- VMCOREINFO_NUMBER(PG_head_mask);
- VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
-#ifdef CONFIG_HUGETLBFS
- VMCOREINFO_SYMBOL(free_huge_page);
-#endif
-
- arch_crash_save_vmcoreinfo();
- update_vmcoreinfo_note();
-
- return 0;
-}
-
-subsys_initcall(crash_save_vmcoreinfo_init);
-
-#ifdef CONFIG_KEXEC_FILE
-static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
- struct kexec_buf *kbuf)
-{
- struct kimage *image = kbuf->image;
- unsigned long temp_start, temp_end;
-
- temp_end = min(end, kbuf->buf_max);
- temp_start = temp_end - kbuf->memsz;
-
- do {
- /* align down start */
- temp_start = temp_start & (~(kbuf->buf_align - 1));
-
- if (temp_start < start || temp_start < kbuf->buf_min)
- return 0;
-
- temp_end = temp_start + kbuf->memsz - 1;
-
- /*
- * Make sure this does not conflict with any of existing
- * segments
- */
- if (kimage_is_destination_range(image, temp_start, temp_end)) {
- temp_start = temp_start - PAGE_SIZE;
- continue;
- }
-
- /* We found a suitable memory range */
- break;
- } while (1);
-
- /* If we are here, we found a suitable memory range */
- kbuf->mem = temp_start;
-
- /* Success, stop navigating through remaining System RAM ranges */
- return 1;
-}
-
-static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
- struct kexec_buf *kbuf)
-{
- struct kimage *image = kbuf->image;
- unsigned long temp_start, temp_end;
-
- temp_start = max(start, kbuf->buf_min);
-
- do {
- temp_start = ALIGN(temp_start, kbuf->buf_align);
- temp_end = temp_start + kbuf->memsz - 1;
-
- if (temp_end > end || temp_end > kbuf->buf_max)
- return 0;
- /*
- * Make sure this does not conflict with any of existing
- * segments
- */
- if (kimage_is_destination_range(image, temp_start, temp_end)) {
- temp_start = temp_start + PAGE_SIZE;
- continue;
- }
-
- /* We found a suitable memory range */
- break;
- } while (1);
-
- /* If we are here, we found a suitable memory range */
- kbuf->mem = temp_start;
-
- /* Success, stop navigating through remaining System RAM ranges */
- return 1;
-}
-
-static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
-{
- struct kexec_buf *kbuf = (struct kexec_buf *)arg;
- unsigned long sz = end - start + 1;
-
- /* Returning 0 will take to next memory range */
- if (sz < kbuf->memsz)
- return 0;
-
- if (end < kbuf->buf_min || start > kbuf->buf_max)
- return 0;
-
- /*
- * Allocate memory top down within the RAM range. Otherwise bottom up
- * allocation.
- */
- if (kbuf->top_down)
- return locate_mem_hole_top_down(start, end, kbuf);
- return locate_mem_hole_bottom_up(start, end, kbuf);
-}
-
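The callback contract here is worth spelling out: walk_system_ram_res()/walk_iomem_res() keep handing RAM ranges to locate_mem_hole_callback() while it returns 0, and stop as soon as it returns 1, at which point kbuf->mem holds the chosen load address. Below is a minimal user-space sketch of the top-down case only; the ram[] table, the sizes and the addresses are hypothetical stand-ins for the resource walk, and the destination-overlap check is omitted.

#include <stdio.h>

#define PAGE_SIZE 4096UL

struct range { unsigned long start, end; };   /* inclusive bounds, like the resource walk */
struct kbuf  { unsigned long memsz, align, min, max, mem; };

/* Return 1 when a hole was found (stop walking), 0 to try the next range. */
static int hole_top_down(unsigned long start, unsigned long end, struct kbuf *b)
{
	unsigned long t_end = end < b->max ? end : b->max;
	unsigned long t_start = (t_end - b->memsz) & ~(b->align - 1);   /* align down */

	if (t_start < start || t_start < b->min)
		return 0;
	b->mem = t_start;                       /* overlap check against segments omitted */
	return 1;
}

int main(void)
{
	struct range ram[] = { { 0x1000, 0x9ffff }, { 0x100000, 0x7fffffff } };
	struct kbuf b = { .memsz = 16 * PAGE_SIZE, .align = PAGE_SIZE,
			  .min = 0x100000, .max = 0x20000000, .mem = 0 };

	for (unsigned int i = 0; i < sizeof(ram) / sizeof(ram[0]); i++)
		if (hole_top_down(ram[i].start, ram[i].end, &b))
			break;
	printf("buffer placed at 0x%lx\n", b.mem);
	return 0;
}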
-/*
- * Helper function for placing a buffer in a kexec segment. This assumes
- * that kexec_mutex is held.
- */
-int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
- unsigned long memsz, unsigned long buf_align,
- unsigned long buf_min, unsigned long buf_max,
- bool top_down, unsigned long *load_addr)
-{
-
- struct kexec_segment *ksegment;
- struct kexec_buf buf, *kbuf;
- int ret;
-
- /* Currently adding segment this way is allowed only in file mode */
- if (!image->file_mode)
- return -EINVAL;
-
- if (image->nr_segments >= KEXEC_SEGMENT_MAX)
- return -EINVAL;
-
- /*
- * Make sure we are not trying to add buffer after allocating
- * control pages. All segments need to be placed first before
- * any control pages are allocated. As control page allocation
- * logic goes through list of segments to make sure there are
- * no destination overlaps.
- */
- if (!list_empty(&image->control_pages)) {
- WARN_ON(1);
- return -EINVAL;
- }
-
- memset(&buf, 0, sizeof(struct kexec_buf));
- kbuf = &buf;
- kbuf->image = image;
- kbuf->buffer = buffer;
- kbuf->bufsz = bufsz;
-
- kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
- kbuf->buf_align = max(buf_align, PAGE_SIZE);
- kbuf->buf_min = buf_min;
- kbuf->buf_max = buf_max;
- kbuf->top_down = top_down;
-
- /* Walk the RAM ranges and allocate a suitable range for the buffer */
- if (image->type == KEXEC_TYPE_CRASH)
- ret = walk_iomem_res("Crash kernel",
- IORESOURCE_MEM | IORESOURCE_BUSY,
- crashk_res.start, crashk_res.end, kbuf,
- locate_mem_hole_callback);
- else
- ret = walk_system_ram_res(0, -1, kbuf,
- locate_mem_hole_callback);
- if (ret != 1) {
- /* A suitable memory range could not be found for buffer */
- return -EADDRNOTAVAIL;
- }
-
- /* Found a suitable memory range */
- ksegment = &image->segment[image->nr_segments];
- ksegment->kbuf = kbuf->buffer;
- ksegment->bufsz = kbuf->bufsz;
- ksegment->mem = kbuf->mem;
- ksegment->memsz = kbuf->memsz;
- image->nr_segments++;
- *load_addr = ksegment->mem;
- return 0;
-}
-
-/* Calculate and store the digest of segments */
-static int kexec_calculate_store_digests(struct kimage *image)
-{
- struct crypto_shash *tfm;
- struct shash_desc *desc;
- int ret = 0, i, j, zero_buf_sz, sha_region_sz;
- size_t desc_size, nullsz;
- char *digest;
- void *zero_buf;
- struct kexec_sha_region *sha_regions;
- struct purgatory_info *pi = &image->purgatory_info;
-
- zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
- zero_buf_sz = PAGE_SIZE;
-
- tfm = crypto_alloc_shash("sha256", 0, 0);
- if (IS_ERR(tfm)) {
- ret = PTR_ERR(tfm);
- goto out;
- }
-
- desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
- desc = kzalloc(desc_size, GFP_KERNEL);
- if (!desc) {
- ret = -ENOMEM;
- goto out_free_tfm;
- }
-
- sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
- sha_regions = vzalloc(sha_region_sz);
- if (!sha_regions)
- goto out_free_desc;
-
- desc->tfm = tfm;
- desc->flags = 0;
-
- ret = crypto_shash_init(desc);
- if (ret < 0)
- goto out_free_sha_regions;
-
- digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
- if (!digest) {
- ret = -ENOMEM;
- goto out_free_sha_regions;
- }
-
- for (j = i = 0; i < image->nr_segments; i++) {
- struct kexec_segment *ksegment;
-
- ksegment = &image->segment[i];
- /*
- * Skip purgatory as it will be modified once we put digest
- * info in purgatory.
- */
- if (ksegment->kbuf == pi->purgatory_buf)
- continue;
-
- ret = crypto_shash_update(desc, ksegment->kbuf,
- ksegment->bufsz);
- if (ret)
- break;
-
- /*
- * Assume rest of the buffer is filled with zero and
- * update digest accordingly.
- */
- nullsz = ksegment->memsz - ksegment->bufsz;
- while (nullsz) {
- unsigned long bytes = nullsz;
-
- if (bytes > zero_buf_sz)
- bytes = zero_buf_sz;
- ret = crypto_shash_update(desc, zero_buf, bytes);
- if (ret)
- break;
- nullsz -= bytes;
- }
-
- if (ret)
- break;
-
- sha_regions[j].start = ksegment->mem;
- sha_regions[j].len = ksegment->memsz;
- j++;
- }
-
- if (!ret) {
- ret = crypto_shash_final(desc, digest);
- if (ret)
- goto out_free_digest;
- ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
- sha_regions, sha_region_sz, 0);
- if (ret)
- goto out_free_digest;
-
- ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
- digest, SHA256_DIGEST_SIZE, 0);
- if (ret)
- goto out_free_digest;
- }
-
-out_free_digest:
- kfree(digest);
-out_free_sha_regions:
- vfree(sha_regions);
-out_free_desc:
- kfree(desc);
-out_free_tfm:
- kfree(tfm);
-out:
- return ret;
-}
-
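Note how the loop above hashes only bufsz bytes of real data and then feeds zero_buf repeatedly, so the digest also covers the zero-filled tail up to memsz. The following user-space sketch models just that padding loop; the additive hash_update() is a deliberately trivial stand-in for crypto_shash_update() (the real code uses SHA-256), and the segment contents are hypothetical.

#include <stdio.h>
#include <stddef.h>

#define ZERO_BUF_SZ 4096

/* Stand-in for crypto_shash_update(): any streaming hash is fed the same way. */
static void hash_update(unsigned long *state, const unsigned char *p, size_t len)
{
	while (len--)
		*state = *state * 31 + *p++;
}

static unsigned long digest_segment(const unsigned char *buf, size_t bufsz, size_t memsz)
{
	static const unsigned char zero_buf[ZERO_BUF_SZ];   /* all zeroes */
	unsigned long state = 0;
	size_t nullsz = memsz - bufsz;

	hash_update(&state, buf, bufsz);
	while (nullsz) {
		size_t bytes = nullsz > ZERO_BUF_SZ ? ZERO_BUF_SZ : nullsz;

		hash_update(&state, zero_buf, bytes);
		nullsz -= bytes;
	}
	return state;
}

int main(void)
{
	unsigned char seg[] = "purgatory-free segment data";

	printf("digest: %lx\n", digest_segment(seg, sizeof(seg), 2 * ZERO_BUF_SZ));
	return 0;
}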
-/* Actually load purgatory. Lot of code taken from kexec-tools */
-static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
- unsigned long max, int top_down)
-{
- struct purgatory_info *pi = &image->purgatory_info;
- unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
- unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
- unsigned char *buf_addr, *src;
- int i, ret = 0, entry_sidx = -1;
- const Elf_Shdr *sechdrs_c;
- Elf_Shdr *sechdrs = NULL;
- void *purgatory_buf = NULL;
-
- /*
- * sechdrs_c points to section headers in purgatory and are read
- * only. No modifications allowed.
- */
- sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
-
- /*
- * We can not modify sechdrs_c[] and its fields. It is read only.
- * Copy it over to a local copy where one can store some temporary
- * data and free it at the end. We need to modify ->sh_addr and
- * ->sh_offset fields to keep track of permanent and temporary
- * locations of sections.
- */
- sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
- if (!sechdrs)
- return -ENOMEM;
-
- memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
-
- /*
- * We seem to have multiple copies of sections. The first copy is the one
- * embedded in the kernel in a read-only section. Some of these sections
- * will be copied to a temporary buffer and relocated. And these
- * sections will finally be copied to their final destination at
- * segment load time.
- *
- * Use ->sh_offset to reflect section address in memory. It will
- * point to original read only copy if section is not allocatable.
- * Otherwise it will point to temporary copy which will be relocated.
- *
- * Use ->sh_addr to contain final address of the section where it
- * will go during execution time.
- */
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- if (sechdrs[i].sh_type == SHT_NOBITS)
- continue;
-
- sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
- sechdrs[i].sh_offset;
- }
-
- /*
- * Identify entry point section and make entry relative to section
- * start.
- */
- entry = pi->ehdr->e_entry;
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- if (!(sechdrs[i].sh_flags & SHF_ALLOC))
- continue;
-
- if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
- continue;
-
- /* Make entry section relative */
- if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
- ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
- pi->ehdr->e_entry)) {
- entry_sidx = i;
- entry -= sechdrs[i].sh_addr;
- break;
- }
- }
-
- /* Determine how much memory is needed to load relocatable object. */
- buf_align = 1;
- bss_align = 1;
- buf_sz = 0;
- bss_sz = 0;
-
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- if (!(sechdrs[i].sh_flags & SHF_ALLOC))
- continue;
-
- align = sechdrs[i].sh_addralign;
- if (sechdrs[i].sh_type != SHT_NOBITS) {
- if (buf_align < align)
- buf_align = align;
- buf_sz = ALIGN(buf_sz, align);
- buf_sz += sechdrs[i].sh_size;
- } else {
- /* bss section */
- if (bss_align < align)
- bss_align = align;
- bss_sz = ALIGN(bss_sz, align);
- bss_sz += sechdrs[i].sh_size;
- }
- }
-
- /* Determine the bss padding required to align bss properly */
- bss_pad = 0;
- if (buf_sz & (bss_align - 1))
- bss_pad = bss_align - (buf_sz & (bss_align - 1));
-
- memsz = buf_sz + bss_pad + bss_sz;
-
- /* Allocate buffer for purgatory */
- purgatory_buf = vzalloc(buf_sz);
- if (!purgatory_buf) {
- ret = -ENOMEM;
- goto out;
- }
-
- if (buf_align < bss_align)
- buf_align = bss_align;
-
- /* Add buffer to segment list */
- ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
- buf_align, min, max, top_down,
- &pi->purgatory_load_addr);
- if (ret)
- goto out;
-
- /* Load SHF_ALLOC sections */
- buf_addr = purgatory_buf;
- load_addr = curr_load_addr = pi->purgatory_load_addr;
- bss_addr = load_addr + buf_sz + bss_pad;
-
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- if (!(sechdrs[i].sh_flags & SHF_ALLOC))
- continue;
-
- align = sechdrs[i].sh_addralign;
- if (sechdrs[i].sh_type != SHT_NOBITS) {
- curr_load_addr = ALIGN(curr_load_addr, align);
- offset = curr_load_addr - load_addr;
- /* We already modified ->sh_offset to keep src addr */
- src = (char *) sechdrs[i].sh_offset;
- memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
-
- /* Store load address and source address of section */
- sechdrs[i].sh_addr = curr_load_addr;
-
- /*
- * This section got copied to temporary buffer. Update
- * ->sh_offset accordingly.
- */
- sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
-
- /* Advance to the next address */
- curr_load_addr += sechdrs[i].sh_size;
- } else {
- bss_addr = ALIGN(bss_addr, align);
- sechdrs[i].sh_addr = bss_addr;
- bss_addr += sechdrs[i].sh_size;
- }
- }
-
- /* Update entry point based on load address of text section */
- if (entry_sidx >= 0)
- entry += sechdrs[entry_sidx].sh_addr;
-
- /* Make kernel jump to purgatory after shutdown */
- image->start = entry;
-
- /* Used later to get/set symbol values */
- pi->sechdrs = sechdrs;
-
- /*
- * Used later to identify which section is purgatory and skip it
- * from checksumming.
- */
- pi->purgatory_buf = purgatory_buf;
- return ret;
-out:
- vfree(sechdrs);
- vfree(purgatory_buf);
- return ret;
-}
-
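The size pass above reduces to three numbers: buf_sz for the loaded (progbits) sections, bss_pad so the bss area starts on bss_align, and bss_sz itself; memsz is their sum. A small sketch of that arithmetic with hypothetical section sizes:

#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	/* Hypothetical purgatory: one 0x1234-byte progbits section (16-byte aligned)
	 * and one 0x200-byte bss section that wants 64-byte alignment. */
	unsigned long buf_sz = ALIGN(0UL, 16UL) + 0x1234;   /* SHF_ALLOC, non-NOBITS bytes */
	unsigned long bss_align = 64, bss_sz = 0x200;
	unsigned long bss_pad = 0;

	if (buf_sz & (bss_align - 1))
		bss_pad = bss_align - (buf_sz & (bss_align - 1));

	printf("buf_sz=0x%lx bss_pad=0x%lx bss_sz=0x%lx memsz=0x%lx\n",
	       buf_sz, bss_pad, bss_sz, buf_sz + bss_pad + bss_sz);
	return 0;
}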
-static int kexec_apply_relocations(struct kimage *image)
-{
- int i, ret;
- struct purgatory_info *pi = &image->purgatory_info;
- Elf_Shdr *sechdrs = pi->sechdrs;
-
- /* Apply relocations */
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- Elf_Shdr *section, *symtab;
-
- if (sechdrs[i].sh_type != SHT_RELA &&
- sechdrs[i].sh_type != SHT_REL)
- continue;
-
- /*
- * For section of type SHT_RELA/SHT_REL,
- * ->sh_link contains section header index of associated
- * symbol table. And ->sh_info contains section header
- * index of section to which relocations apply.
- */
- if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
- sechdrs[i].sh_link >= pi->ehdr->e_shnum)
- return -ENOEXEC;
-
- section = &sechdrs[sechdrs[i].sh_info];
- symtab = &sechdrs[sechdrs[i].sh_link];
-
- if (!(section->sh_flags & SHF_ALLOC))
- continue;
-
- /*
- * symtab->sh_link contains section header index of associated
- * string table.
- */
- if (symtab->sh_link >= pi->ehdr->e_shnum)
- /* Invalid section number? */
- continue;
-
- /*
- * Respective architecture needs to provide support for applying
- * relocations of type SHT_RELA/SHT_REL.
- */
- if (sechdrs[i].sh_type == SHT_RELA)
- ret = arch_kexec_apply_relocations_add(pi->ehdr,
- sechdrs, i);
- else if (sechdrs[i].sh_type == SHT_REL)
- ret = arch_kexec_apply_relocations(pi->ehdr,
- sechdrs, i);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/* Load relocatable purgatory object and relocate it appropriately */
-int kexec_load_purgatory(struct kimage *image, unsigned long min,
- unsigned long max, int top_down,
- unsigned long *load_addr)
-{
- struct purgatory_info *pi = &image->purgatory_info;
- int ret;
-
- if (kexec_purgatory_size <= 0)
- return -EINVAL;
-
- if (kexec_purgatory_size < sizeof(Elf_Ehdr))
- return -ENOEXEC;
-
- pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
-
- if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
- || pi->ehdr->e_type != ET_REL
- || !elf_check_arch(pi->ehdr)
- || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
- return -ENOEXEC;
-
- if (pi->ehdr->e_shoff >= kexec_purgatory_size
- || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
- kexec_purgatory_size - pi->ehdr->e_shoff))
- return -ENOEXEC;
-
- ret = __kexec_load_purgatory(image, min, max, top_down);
- if (ret)
- return ret;
-
- ret = kexec_apply_relocations(image);
- if (ret)
- goto out;
-
- *load_addr = pi->purgatory_load_addr;
- return 0;
-out:
- vfree(pi->sechdrs);
- vfree(pi->purgatory_buf);
- return ret;
-}
-
-static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
- const char *name)
-{
- Elf_Sym *syms;
- Elf_Shdr *sechdrs;
- Elf_Ehdr *ehdr;
- int i, k;
- const char *strtab;
-
- if (!pi->sechdrs || !pi->ehdr)
- return NULL;
-
- sechdrs = pi->sechdrs;
- ehdr = pi->ehdr;
-
- for (i = 0; i < ehdr->e_shnum; i++) {
- if (sechdrs[i].sh_type != SHT_SYMTAB)
- continue;
-
- if (sechdrs[i].sh_link >= ehdr->e_shnum)
- /* Invalid strtab section number */
- continue;
- strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
- syms = (Elf_Sym *)sechdrs[i].sh_offset;
-
- /* Go through symbols for a match */
- for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
- if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
- continue;
-
- if (strcmp(strtab + syms[k].st_name, name) != 0)
- continue;
-
- if (syms[k].st_shndx == SHN_UNDEF ||
- syms[k].st_shndx >= ehdr->e_shnum) {
- pr_debug("Symbol: %s has bad section index %d.\n",
- name, syms[k].st_shndx);
- return NULL;
- }
-
- /* Found the symbol we are looking for */
- return &syms[k];
- }
- }
-
- return NULL;
-}
-
-void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
-{
- struct purgatory_info *pi = &image->purgatory_info;
- Elf_Sym *sym;
- Elf_Shdr *sechdr;
-
- sym = kexec_purgatory_find_symbol(pi, name);
- if (!sym)
- return ERR_PTR(-EINVAL);
-
- sechdr = &pi->sechdrs[sym->st_shndx];
-
- /*
- * Returns the address where symbol will finally be loaded after
- * kexec_load_segment()
- */
- return (void *)(sechdr->sh_addr + sym->st_value);
-}
-
-/*
- * Get or set value of a symbol. If "get_value" is true, symbol value is
- * returned in buf otherwise symbol value is set based on value in buf.
- */
-int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
- void *buf, unsigned int size, bool get_value)
-{
- Elf_Sym *sym;
- Elf_Shdr *sechdrs;
- struct purgatory_info *pi = &image->purgatory_info;
- char *sym_buf;
-
- sym = kexec_purgatory_find_symbol(pi, name);
- if (!sym)
- return -EINVAL;
-
- if (sym->st_size != size) {
- pr_err("symbol %s size mismatch: expected %lu actual %u\n",
- name, (unsigned long)sym->st_size, size);
- return -EINVAL;
- }
-
- sechdrs = pi->sechdrs;
-
- if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
- pr_err("symbol %s is in a bss section. Cannot %s\n", name,
- get_value ? "get" : "set");
- return -EINVAL;
- }
-
- sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
- sym->st_value;
-
- if (get_value)
- memcpy((void *)buf, sym_buf, size);
- else
- memcpy((void *)sym_buf, buf, size);
-
- return 0;
-}
-#endif /* CONFIG_KEXEC_FILE */
-
-/*
- * Move into place and start executing a preloaded standalone
- * executable. If nothing was preloaded return an error.
- */
-int kernel_kexec(void)
-{
- int error = 0;
-
- if (!mutex_trylock(&kexec_mutex))
- return -EBUSY;
- if (!kexec_image) {
- error = -EINVAL;
- goto Unlock;
- }
-
-#ifdef CONFIG_KEXEC_JUMP
- if (kexec_image->preserve_context) {
- lock_system_sleep();
- pm_prepare_console();
- error = freeze_processes();
- if (error) {
- error = -EBUSY;
- goto Restore_console;
- }
- suspend_console();
- error = dpm_suspend_start(PMSG_FREEZE);
- if (error)
- goto Resume_console;
- /* At this point, dpm_suspend_start() has been called,
- * but *not* dpm_suspend_end(). We *must* call
- * dpm_suspend_end() now. Otherwise, drivers for
- * some devices (e.g. interrupt controllers) become
- * desynchronized with the actual state of the
- * hardware at resume time, and evil weirdness ensues.
- */
- error = dpm_suspend_end(PMSG_FREEZE);
- if (error)
- goto Resume_devices;
- error = disable_nonboot_cpus();
- if (error)
- goto Enable_cpus;
- local_irq_disable();
- error = syscore_suspend();
- if (error)
- goto Enable_irqs;
- } else
-#endif
- {
- kexec_in_progress = true;
- kernel_restart_prepare(NULL);
- migrate_to_reboot_cpu();
-
- /*
- * migrate_to_reboot_cpu() disables CPU hotplug assuming that
- * no further code needs to use CPU hotplug (which is true in
- * the reboot case). However, the kexec path depends on using
- * CPU hotplug again; so re-enable it here.
- */
- cpu_hotplug_enable();
- pr_emerg("Starting new kernel\n");
- machine_shutdown();
- }
-
- machine_kexec(kexec_image);
-
-#ifdef CONFIG_KEXEC_JUMP
- if (kexec_image->preserve_context) {
- syscore_resume();
- Enable_irqs:
- local_irq_enable();
- Enable_cpus:
- enable_nonboot_cpus();
- dpm_resume_start(PMSG_RESTORE);
- Resume_devices:
- dpm_resume_end(PMSG_RESTORE);
- Resume_console:
- resume_console();
- thaw_processes();
- Restore_console:
- pm_restore_console();
- unlock_system_sleep();
- }
-#endif
-
- Unlock:
- mutex_unlock(&kexec_mutex);
- return error;
-}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
new file mode 100644
index 000000000000..201b45327804
--- /dev/null
+++ b/kernel/kexec_core.c
@@ -0,0 +1,1534 @@
+/*
+ * kexec.c - kexec system call core code.
+ * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#define pr_fmt(fmt) "kexec: " fmt
+
+#include <linux/capability.h>
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/kexec.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/syscalls.h>
+#include <linux/reboot.h>
+#include <linux/ioport.h>
+#include <linux/hardirq.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/utsname.h>
+#include <linux/numa.h>
+#include <linux/suspend.h>
+#include <linux/device.h>
+#include <linux/freezer.h>
+#include <linux/pm.h>
+#include <linux/cpu.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/console.h>
+#include <linux/vmalloc.h>
+#include <linux/swap.h>
+#include <linux/syscore_ops.h>
+#include <linux/compiler.h>
+#include <linux/hugetlb.h>
+
+#include <asm/page.h>
+#include <asm/sections.h>
+
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include "kexec_internal.h"
+
+DEFINE_MUTEX(kexec_mutex);
+
+/* Per cpu memory for storing cpu states in case of system crash. */
+note_buf_t __percpu *crash_notes;
+
+/* vmcoreinfo stuff */
+static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
+u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
+size_t vmcoreinfo_size;
+size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
+
+/* Flag to indicate we are going to kexec a new kernel */
+bool kexec_in_progress = false;
+
+
+/* Location of the reserved area for the crash kernel */
+struct resource crashk_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+struct resource crashk_low_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+int kexec_should_crash(struct task_struct *p)
+{
+ /*
+ * If crash_kexec_post_notifiers is enabled, don't run
+ * crash_kexec() here yet, which must be run after panic
+ * notifiers in panic().
+ */
+ if (crash_kexec_post_notifiers)
+ return 0;
+ /*
+ * There are 4 panic() calls in do_exit() path, each of which
+ * corresponds to each of these 4 conditions.
+ */
+ if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
+ return 1;
+ return 0;
+}
+
+/*
+ * When kexec transitions to the new kernel there is a one-to-one
+ * mapping between physical and virtual addresses. On processors
+ * where you can disable the MMU this is trivial, and easy. For
+ * others it is still a simple predictable page table to setup.
+ *
+ * In that environment kexec copies the new kernel to its final
+ * resting place. This means I can only support memory whose
+ * physical address can fit in an unsigned long. In particular
+ * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
+ * If the assembly stub has more restrictive requirements
+ * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
+ * defined more restrictively in <asm/kexec.h>.
+ *
+ * The code for the transition from the current kernel to the
+ * new kernel is placed in the control_code_buffer, whose size
+ * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
+ * page of memory is necessary, but some architectures require more.
+ * Because this memory must be identity mapped in the transition from
+ * virtual to physical addresses it must live in the range
+ * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
+ * modifiable.
+ *
+ * The assembly stub in the control code buffer is passed a linked list
+ * of descriptor pages detailing the source pages of the new kernel,
+ * and the destination addresses of those source pages. As this data
+ * structure is not used in the context of the current OS, it must
+ * be self-contained.
+ *
+ * The code has been made to work with highmem pages and will use a
+ * destination page in its final resting place (if it happens
+ * to allocate it). The end product of this is that most of the
+ * physical address space, and most of RAM can be used.
+ *
+ * Future directions include:
+ * - allocating a page table with the control code buffer identity
+ * mapped, to simplify machine_kexec and make kexec_on_panic more
+ * reliable.
+ */
+
+/*
+ * KIMAGE_NO_DEST is an impossible destination address..., for
+ * allocating pages whose destination address we do not care about.
+ */
+#define KIMAGE_NO_DEST (-1UL)
+
+static struct page *kimage_alloc_page(struct kimage *image,
+ gfp_t gfp_mask,
+ unsigned long dest);
+
+int sanity_check_segment_list(struct kimage *image)
+{
+ int result, i;
+ unsigned long nr_segments = image->nr_segments;
+
+ /*
+ * Verify we have good destination addresses. The caller is
+ * responsible for making certain we don't attempt to load
+ * the new image into invalid or reserved areas of RAM. This
+ * just verifies it is an address we can use.
+ *
+ * Since the kernel does everything in page size chunks ensure
+ * the destination addresses are page aligned. Too many
+ * special cases crop up when we don't do this. The most
+ * insidious is getting overlapping destination addresses
+ * simply because addresses are changed to page size
+ * granularity.
+ */
+ result = -EADDRNOTAVAIL;
+ for (i = 0; i < nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz;
+ if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
+ return result;
+ if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
+ return result;
+ }
+
+ /* Verify our destination addresses do not overlap.
+ * If we allowed overlapping destination addresses
+ * through, very weird things can happen with no
+ * easy explanation as one segment stomps on another.
+ */
+ result = -EINVAL;
+ for (i = 0; i < nr_segments; i++) {
+ unsigned long mstart, mend;
+ unsigned long j;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz;
+ for (j = 0; j < i; j++) {
+ unsigned long pstart, pend;
+
+ pstart = image->segment[j].mem;
+ pend = pstart + image->segment[j].memsz;
+ /* Do the segments overlap ? */
+ if ((mend > pstart) && (mstart < pend))
+ return result;
+ }
+ }
+
+ /* Ensure our buffer sizes do not exceed
+ * our memory sizes. This should always be the case,
+ * and it is easier to check up front than to be surprised
+ * later on.
+ */
+ result = -EINVAL;
+ for (i = 0; i < nr_segments; i++) {
+ if (image->segment[i].bufsz > image->segment[i].memsz)
+ return result;
+ }
+
+ /*
+ * Verify we have good destination addresses. Normally
+ * the caller is responsible for making certain we don't
+ * attempt to load the new image into invalid or reserved
+ * areas of RAM. But crash kernels are preloaded into a
+ * reserved area of ram. We must ensure the addresses
+ * are in the reserved area otherwise preloading the
+ * kernel could corrupt things.
+ */
+
+ if (image->type == KEXEC_TYPE_CRASH) {
+ result = -EADDRNOTAVAIL;
+ for (i = 0; i < nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz - 1;
+ /* Ensure we are within the crash kernel limits */
+ if ((mstart < crashk_res.start) ||
+ (mend > crashk_res.end))
+ return result;
+ }
+ }
+
+ return 0;
+}
+
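The pairwise test used above is the usual interval-overlap check: with exclusive ends, segments [mstart, mend) and [pstart, pend) collide exactly when mend > pstart and mstart < pend. A one-function sketch with hypothetical segment addresses:

#include <stdio.h>

struct seg { unsigned long mem, memsz; };

static int overlaps(const struct seg *a, const struct seg *b)
{
	unsigned long aend = a->mem + a->memsz;   /* exclusive end, as in the check above */
	unsigned long bend = b->mem + b->memsz;

	return aend > b->mem && a->mem < bend;
}

int main(void)
{
	struct seg a = { 0x100000, 0x2000 }, b = { 0x101000, 0x1000 };

	printf("overlap: %d\n", overlaps(&a, &b));   /* prints 1 */
	return 0;
}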
+struct kimage *do_kimage_alloc_init(void)
+{
+ struct kimage *image;
+
+ /* Allocate a controlling structure */
+ image = kzalloc(sizeof(*image), GFP_KERNEL);
+ if (!image)
+ return NULL;
+
+ image->head = 0;
+ image->entry = &image->head;
+ image->last_entry = &image->head;
+ image->control_page = ~0; /* By default this does not apply */
+ image->type = KEXEC_TYPE_DEFAULT;
+
+ /* Initialize the list of control pages */
+ INIT_LIST_HEAD(&image->control_pages);
+
+ /* Initialize the list of destination pages */
+ INIT_LIST_HEAD(&image->dest_pages);
+
+ /* Initialize the list of unusable pages */
+ INIT_LIST_HEAD(&image->unusable_pages);
+
+ return image;
+}
+
+int kimage_is_destination_range(struct kimage *image,
+ unsigned long start,
+ unsigned long end)
+{
+ unsigned long i;
+
+ for (i = 0; i < image->nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz;
+ if ((end > mstart) && (start < mend))
+ return 1;
+ }
+
+ return 0;
+}
+
+static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
+{
+ struct page *pages;
+
+ pages = alloc_pages(gfp_mask, order);
+ if (pages) {
+ unsigned int count, i;
+
+ pages->mapping = NULL;
+ set_page_private(pages, order);
+ count = 1 << order;
+ for (i = 0; i < count; i++)
+ SetPageReserved(pages + i);
+ }
+
+ return pages;
+}
+
+static void kimage_free_pages(struct page *page)
+{
+ unsigned int order, count, i;
+
+ order = page_private(page);
+ count = 1 << order;
+ for (i = 0; i < count; i++)
+ ClearPageReserved(page + i);
+ __free_pages(page, order);
+}
+
+void kimage_free_page_list(struct list_head *list)
+{
+ struct list_head *pos, *next;
+
+ list_for_each_safe(pos, next, list) {
+ struct page *page;
+
+ page = list_entry(pos, struct page, lru);
+ list_del(&page->lru);
+ kimage_free_pages(page);
+ }
+}
+
+static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
+ unsigned int order)
+{
+ /* Control pages are special, they are the intermediaries
+ * that are needed while we copy the rest of the pages
+ * to their final resting place. As such they must
+ * not conflict with either the destination addresses
+ * or memory the kernel is already using.
+ *
+ * The only case where we really need more than one of
+ * these are for architectures where we cannot disable
+ * the MMU and must instead generate an identity mapped
+ * page table for all of the memory.
+ *
+ * At worst this runs in O(N) of the image size.
+ */
+ struct list_head extra_pages;
+ struct page *pages;
+ unsigned int count;
+
+ count = 1 << order;
+ INIT_LIST_HEAD(&extra_pages);
+
+ /* Loop while I can allocate a page and the page allocated
+ * is a destination page.
+ */
+ do {
+ unsigned long pfn, epfn, addr, eaddr;
+
+ pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
+ if (!pages)
+ break;
+ pfn = page_to_pfn(pages);
+ epfn = pfn + count;
+ addr = pfn << PAGE_SHIFT;
+ eaddr = epfn << PAGE_SHIFT;
+ if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
+ kimage_is_destination_range(image, addr, eaddr)) {
+ list_add(&pages->lru, &extra_pages);
+ pages = NULL;
+ }
+ } while (!pages);
+
+ if (pages) {
+ /* Remember the allocated page... */
+ list_add(&pages->lru, &image->control_pages);
+
+ /* Because the page is already in its destination
+ * location we will never allocate another page at
+ * that address. Therefore kimage_alloc_pages
+ * will not return it (again) and we don't need
+ * to give it an entry in image->segment[].
+ */
+ }
+ /* Deal with the destination pages I have inadvertently allocated.
+ *
+ * Ideally I would convert multi-page allocations into single
+ * page allocations, and add everything to image->dest_pages.
+ *
+ * For now it is simpler to just free the pages.
+ */
+ kimage_free_page_list(&extra_pages);
+
+ return pages;
+}
+
+static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
+ unsigned int order)
+{
+ /* Control pages are special, they are the intermediaries
+ * that are needed while we copy the rest of the pages
+ * to their final resting place. As such they must
+ * not conflict with either the destination addresses
+ * or memory the kernel is already using.
+ *
+ * Control pages are also the only pages we must allocate
+ * when loading a crash kernel. All of the other pages
+ * are specified by the segments and we just memcpy
+ * into them directly.
+ *
+ * The only case where we really need more than one of
+ * these are for architectures where we cannot disable
+ * the MMU and must instead generate an identity mapped
+ * page table for all of the memory.
+ *
+ * Given the low demand this implements a very simple
+ * allocator that finds the first hole of the appropriate
+ * size in the reserved memory region, and allocates all
+ * of the memory up to and including the hole.
+ */
+ unsigned long hole_start, hole_end, size;
+ struct page *pages;
+
+ pages = NULL;
+ size = (1 << order) << PAGE_SHIFT;
+ hole_start = (image->control_page + (size - 1)) & ~(size - 1);
+ hole_end = hole_start + size - 1;
+ while (hole_end <= crashk_res.end) {
+ unsigned long i;
+
+ if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
+ break;
+ /* See if I overlap any of the segments */
+ for (i = 0; i < image->nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz - 1;
+ if ((hole_end >= mstart) && (hole_start <= mend)) {
+ /* Advance the hole to the end of the segment */
+ hole_start = (mend + (size - 1)) & ~(size - 1);
+ hole_end = hole_start + size - 1;
+ break;
+ }
+ }
+ /* If I don't overlap any segments I have found my hole! */
+ if (i == image->nr_segments) {
+ pages = pfn_to_page(hole_start >> PAGE_SHIFT);
+ image->control_page = hole_end;
+ break;
+ }
+ }
+
+ return pages;
+}
+
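The crash-kernel allocator above scans the reserved window for the first size-aligned hole that misses every segment; whenever a candidate hole hits a segment, it is bumped past the segment's end, rounded up to the allocation size. A sketch of that advance step with hypothetical addresses:

#include <stdio.h>

int main(void)
{
	unsigned long size = 8192;                              /* (1 << order) pages */
	unsigned long seg_start = 0x1000000, seg_end = 0x1003fff;
	unsigned long hole_start = 0x1000000;

	/* Hole overlaps the segment: advance past it, keeping size alignment. */
	if (hole_start + size - 1 >= seg_start && hole_start <= seg_end)
		hole_start = (seg_end + (size - 1)) & ~(size - 1);

	printf("next candidate hole at 0x%lx\n", hole_start);
	return 0;
}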
+
+struct page *kimage_alloc_control_pages(struct kimage *image,
+ unsigned int order)
+{
+ struct page *pages = NULL;
+
+ switch (image->type) {
+ case KEXEC_TYPE_DEFAULT:
+ pages = kimage_alloc_normal_control_pages(image, order);
+ break;
+ case KEXEC_TYPE_CRASH:
+ pages = kimage_alloc_crash_control_pages(image, order);
+ break;
+ }
+
+ return pages;
+}
+
+static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+{
+ if (*image->entry != 0)
+ image->entry++;
+
+ if (image->entry == image->last_entry) {
+ kimage_entry_t *ind_page;
+ struct page *page;
+
+ page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
+ if (!page)
+ return -ENOMEM;
+
+ ind_page = page_address(page);
+ *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+ image->entry = ind_page;
+ image->last_entry = ind_page +
+ ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+ }
+ *image->entry = entry;
+ image->entry++;
+ *image->entry = 0;
+
+ return 0;
+}
+
+static int kimage_set_destination(struct kimage *image,
+ unsigned long destination)
+{
+ int result;
+
+ destination &= PAGE_MASK;
+ result = kimage_add_entry(image, destination | IND_DESTINATION);
+
+ return result;
+}
+
+
+static int kimage_add_page(struct kimage *image, unsigned long page)
+{
+ int result;
+
+ page &= PAGE_MASK;
+ result = kimage_add_entry(image, page | IND_SOURCE);
+
+ return result;
+}
+
+
+static void kimage_free_extra_pages(struct kimage *image)
+{
+ /* Walk through and free any extra destination pages I may have */
+ kimage_free_page_list(&image->dest_pages);
+
+ /* Walk through and free any unusable pages I have cached */
+ kimage_free_page_list(&image->unusable_pages);
+
+}
+void kimage_terminate(struct kimage *image)
+{
+ if (*image->entry != 0)
+ image->entry++;
+
+ *image->entry = IND_DONE;
+}
+
+#define for_each_kimage_entry(image, ptr, entry) \
+ for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+ ptr = (entry & IND_INDIRECTION) ? \
+ phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
+
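The kimage entry list that this macro walks is an array of page-aligned physical addresses with flag bits in the low bits: IND_DESTINATION starts a new destination run, IND_SOURCE names a source page to copy there, IND_INDIRECTION chains to the next page of entries, and IND_DONE terminates the list. Below is a user-space sketch of a walk over a single entry page; the flag values are illustrative (the real definitions live in include/linux/kexec.h) and indirection chaining is omitted.

#include <stdio.h>

typedef unsigned long kimage_entry_t;

/* Illustrative flag bits; real values are defined in include/linux/kexec.h. */
#define IND_DESTINATION 0x1
#define IND_INDIRECTION 0x2
#define IND_DONE        0x4
#define IND_SOURCE      0x8
#define PAGE_MASK       (~4095UL)

int main(void)
{
	/* One destination run: copy two source pages to 0x200000 onwards. */
	kimage_entry_t list[] = {
		0x200000 | IND_DESTINATION,
		0x7a1000 | IND_SOURCE,
		0x7b3000 | IND_SOURCE,
		IND_DONE,
	};
	unsigned long dest = 0;

	for (kimage_entry_t *p = list; !(*p & IND_DONE); p++) {
		if (*p & IND_DESTINATION)
			dest = *p & PAGE_MASK;
		else if (*p & IND_SOURCE) {
			printf("copy page 0x%lx -> 0x%lx\n", *p & PAGE_MASK, dest);
			dest += 4096;
		}
	}
	return 0;
}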
+static void kimage_free_entry(kimage_entry_t entry)
+{
+ struct page *page;
+
+ page = pfn_to_page(entry >> PAGE_SHIFT);
+ kimage_free_pages(page);
+}
+
+void kimage_free(struct kimage *image)
+{
+ kimage_entry_t *ptr, entry;
+ kimage_entry_t ind = 0;
+
+ if (!image)
+ return;
+
+ kimage_free_extra_pages(image);
+ for_each_kimage_entry(image, ptr, entry) {
+ if (entry & IND_INDIRECTION) {
+ /* Free the previous indirection page */
+ if (ind & IND_INDIRECTION)
+ kimage_free_entry(ind);
+ /* Save this indirection page until we are
+ * done with it.
+ */
+ ind = entry;
+ } else if (entry & IND_SOURCE)
+ kimage_free_entry(entry);
+ }
+ /* Free the final indirection page */
+ if (ind & IND_INDIRECTION)
+ kimage_free_entry(ind);
+
+ /* Handle any machine specific cleanup */
+ machine_kexec_cleanup(image);
+
+ /* Free the kexec control pages... */
+ kimage_free_page_list(&image->control_pages);
+
+ /*
+ * Free up any temporary buffers allocated. This might hit if
+ * error occurred much later after buffer allocation.
+ */
+ if (image->file_mode)
+ kimage_file_post_load_cleanup(image);
+
+ kfree(image);
+}
+
+static kimage_entry_t *kimage_dst_used(struct kimage *image,
+ unsigned long page)
+{
+ kimage_entry_t *ptr, entry;
+ unsigned long destination = 0;
+
+ for_each_kimage_entry(image, ptr, entry) {
+ if (entry & IND_DESTINATION)
+ destination = entry & PAGE_MASK;
+ else if (entry & IND_SOURCE) {
+ if (page == destination)
+ return ptr;
+ destination += PAGE_SIZE;
+ }
+ }
+
+ return NULL;
+}
+
+static struct page *kimage_alloc_page(struct kimage *image,
+ gfp_t gfp_mask,
+ unsigned long destination)
+{
+ /*
+ * Here we implement safeguards to ensure that a source page
+ * is not copied to its destination page before the data on
+ * the destination page is no longer useful.
+ *
+ * To do this we maintain the invariant that a source page is
+ * either its own destination page, or it is not a
+ * destination page at all.
+ *
+ * That is slightly stronger than required, but the proof
+ * that no problems will occur is trivial, and the
+ * implementation is simple to verify.
+ *
+ * When allocating all pages normally this algorithm will run
+ * in O(N) time, but in the worst case it will run in O(N^2)
+ * time. If the runtime is a problem the data structures can
+ * be fixed.
+ */
+ struct page *page;
+ unsigned long addr;
+
+ /*
+ * Walk through the list of destination pages, and see if I
+ * have a match.
+ */
+ list_for_each_entry(page, &image->dest_pages, lru) {
+ addr = page_to_pfn(page) << PAGE_SHIFT;
+ if (addr == destination) {
+ list_del(&page->lru);
+ return page;
+ }
+ }
+ page = NULL;
+ while (1) {
+ kimage_entry_t *old;
+
+ /* Allocate a page, if we run out of memory give up */
+ page = kimage_alloc_pages(gfp_mask, 0);
+ if (!page)
+ return NULL;
+ /* If the page cannot be used file it away */
+ if (page_to_pfn(page) >
+ (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+ list_add(&page->lru, &image->unusable_pages);
+ continue;
+ }
+ addr = page_to_pfn(page) << PAGE_SHIFT;
+
+ /* If it is the destination page we want use it */
+ if (addr == destination)
+ break;
+
+ /* If the page is not a destination page use it */
+ if (!kimage_is_destination_range(image, addr,
+ addr + PAGE_SIZE))
+ break;
+
+ /*
+ * I know that the page is someone's destination page.
+ * See if there is already a source page for this
+ * destination page. And if so swap the source pages.
+ */
+ old = kimage_dst_used(image, addr);
+ if (old) {
+ /* If so move it */
+ unsigned long old_addr;
+ struct page *old_page;
+
+ old_addr = *old & PAGE_MASK;
+ old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+ copy_highpage(page, old_page);
+ *old = addr | (*old & ~PAGE_MASK);
+
+ /* The old page I have found cannot be a
+ * destination page, so return it if its
+ * gfp_flags honor the ones passed in.
+ */
+ if (!(gfp_mask & __GFP_HIGHMEM) &&
+ PageHighMem(old_page)) {
+ kimage_free_pages(old_page);
+ continue;
+ }
+ addr = old_addr;
+ page = old_page;
+ break;
+ }
+ /* Place the page on the destination list, to be used later */
+ list_add(&page->lru, &image->dest_pages);
+ }
+
+ return page;
+}
+
+static int kimage_load_normal_segment(struct kimage *image,
+ struct kexec_segment *segment)
+{
+ unsigned long maddr;
+ size_t ubytes, mbytes;
+ int result;
+ unsigned char __user *buf = NULL;
+ unsigned char *kbuf = NULL;
+
+ result = 0;
+ if (image->file_mode)
+ kbuf = segment->kbuf;
+ else
+ buf = segment->buf;
+ ubytes = segment->bufsz;
+ mbytes = segment->memsz;
+ maddr = segment->mem;
+
+ result = kimage_set_destination(image, maddr);
+ if (result < 0)
+ goto out;
+
+ while (mbytes) {
+ struct page *page;
+ char *ptr;
+ size_t uchunk, mchunk;
+
+ page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
+ if (!page) {
+ result = -ENOMEM;
+ goto out;
+ }
+ result = kimage_add_page(image, page_to_pfn(page)
+ << PAGE_SHIFT);
+ if (result < 0)
+ goto out;
+
+ ptr = kmap(page);
+ /* Start with a clear page */
+ clear_page(ptr);
+ ptr += maddr & ~PAGE_MASK;
+ mchunk = min_t(size_t, mbytes,
+ PAGE_SIZE - (maddr & ~PAGE_MASK));
+ uchunk = min(ubytes, mchunk);
+
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
+ kunmap(page);
+ if (result) {
+ result = -EFAULT;
+ goto out;
+ }
+ ubytes -= uchunk;
+ maddr += mchunk;
+ if (image->file_mode)
+ kbuf += mchunk;
+ else
+ buf += mchunk;
+ mbytes -= mchunk;
+ }
+out:
+ return result;
+}
+
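The copy loop above works one destination page at a time: mchunk is clamped to what remains of the current page, uchunk to what remains of the user buffer, and any bytes beyond bufsz are simply left as the zeroes from clear_page(). A sketch of the chunk arithmetic with hypothetical sizes:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long maddr = 0x100c00;     /* destination, not page aligned */
	unsigned long ubytes = 5000;        /* bytes of real data (bufsz)    */
	unsigned long mbytes = 12288;       /* bytes to cover (memsz)        */

	while (mbytes) {
		unsigned long mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);

		if (mchunk > mbytes)
			mchunk = mbytes;
		unsigned long uchunk = ubytes < mchunk ? ubytes : mchunk;

		printf("page 0x%lx: copy %lu bytes, zero %lu bytes\n",
		       maddr & PAGE_MASK, uchunk, mchunk - uchunk);
		ubytes -= uchunk;
		maddr  += mchunk;
		mbytes -= mchunk;
	}
	return 0;
}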
+static int kimage_load_crash_segment(struct kimage *image,
+ struct kexec_segment *segment)
+{
+ /* For crash dump kernels we simply copy the data from
+ * user space to its destination.
+ * We do things a page at a time for the sake of kmap.
+ */
+ unsigned long maddr;
+ size_t ubytes, mbytes;
+ int result;
+ unsigned char __user *buf = NULL;
+ unsigned char *kbuf = NULL;
+
+ result = 0;
+ if (image->file_mode)
+ kbuf = segment->kbuf;
+ else
+ buf = segment->buf;
+ ubytes = segment->bufsz;
+ mbytes = segment->memsz;
+ maddr = segment->mem;
+ while (mbytes) {
+ struct page *page;
+ char *ptr;
+ size_t uchunk, mchunk;
+
+ page = pfn_to_page(maddr >> PAGE_SHIFT);
+ if (!page) {
+ result = -ENOMEM;
+ goto out;
+ }
+ ptr = kmap(page);
+ ptr += maddr & ~PAGE_MASK;
+ mchunk = min_t(size_t, mbytes,
+ PAGE_SIZE - (maddr & ~PAGE_MASK));
+ uchunk = min(ubytes, mchunk);
+ if (mchunk > uchunk) {
+ /* Zero the trailing part of the page */
+ memset(ptr + uchunk, 0, mchunk - uchunk);
+ }
+
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
+ kexec_flush_icache_page(page);
+ kunmap(page);
+ if (result) {
+ result = -EFAULT;
+ goto out;
+ }
+ ubytes -= uchunk;
+ maddr += mchunk;
+ if (image->file_mode)
+ kbuf += mchunk;
+ else
+ buf += mchunk;
+ mbytes -= mchunk;
+ }
+out:
+ return result;
+}
+
+int kimage_load_segment(struct kimage *image,
+ struct kexec_segment *segment)
+{
+ int result = -ENOMEM;
+
+ switch (image->type) {
+ case KEXEC_TYPE_DEFAULT:
+ result = kimage_load_normal_segment(image, segment);
+ break;
+ case KEXEC_TYPE_CRASH:
+ result = kimage_load_crash_segment(image, segment);
+ break;
+ }
+
+ return result;
+}
+
+struct kimage *kexec_image;
+struct kimage *kexec_crash_image;
+int kexec_load_disabled;
+
+void crash_kexec(struct pt_regs *regs)
+{
+ /* Take the kexec_mutex here to prevent sys_kexec_load
+ * running on one cpu from replacing the crash kernel
+ * we are using after a panic on a different cpu.
+ *
+ * If the crash kernel was not located in a fixed area
+ * of memory the xchg(&kexec_crash_image) would be
+ * sufficient. But since I reuse the memory...
+ */
+ if (mutex_trylock(&kexec_mutex)) {
+ if (kexec_crash_image) {
+ struct pt_regs fixed_regs;
+
+ crash_setup_regs(&fixed_regs, regs);
+ crash_save_vmcoreinfo();
+ machine_crash_shutdown(&fixed_regs);
+ machine_kexec(kexec_crash_image);
+ }
+ mutex_unlock(&kexec_mutex);
+ }
+}
+
+size_t crash_get_memory_size(void)
+{
+ size_t size = 0;
+
+ mutex_lock(&kexec_mutex);
+ if (crashk_res.end != crashk_res.start)
+ size = resource_size(&crashk_res);
+ mutex_unlock(&kexec_mutex);
+ return size;
+}
+
+void __weak crash_free_reserved_phys_range(unsigned long begin,
+ unsigned long end)
+{
+ unsigned long addr;
+
+ for (addr = begin; addr < end; addr += PAGE_SIZE)
+ free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
+}
+
+int crash_shrink_memory(unsigned long new_size)
+{
+ int ret = 0;
+ unsigned long start, end;
+ unsigned long old_size;
+ struct resource *ram_res;
+
+ mutex_lock(&kexec_mutex);
+
+ if (kexec_crash_image) {
+ ret = -ENOENT;
+ goto unlock;
+ }
+ start = crashk_res.start;
+ end = crashk_res.end;
+ old_size = (end == 0) ? 0 : end - start + 1;
+ if (new_size >= old_size) {
+ ret = (new_size == old_size) ? 0 : -EINVAL;
+ goto unlock;
+ }
+
+ ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
+ if (!ram_res) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
+ end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
+
+ crash_map_reserved_pages();
+ crash_free_reserved_phys_range(end, crashk_res.end);
+
+ if ((start == end) && (crashk_res.parent != NULL))
+ release_resource(&crashk_res);
+
+ ram_res->start = end;
+ ram_res->end = crashk_res.end;
+ ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+ ram_res->name = "System RAM";
+
+ crashk_res.end = end - 1;
+
+ insert_resource(&iomem_resource, ram_res);
+ crash_unmap_reserved_pages();
+
+unlock:
+ mutex_unlock(&kexec_mutex);
+ return ret;
+}
+
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+ size_t data_len)
+{
+ struct elf_note note;
+
+ note.n_namesz = strlen(name) + 1;
+ note.n_descsz = data_len;
+ note.n_type = type;
+ memcpy(buf, &note, sizeof(note));
+ buf += (sizeof(note) + 3)/4;
+ memcpy(buf, name, note.n_namesz);
+ buf += (note.n_namesz + 3)/4;
+ memcpy(buf, data, note.n_descsz);
+ buf += (note.n_descsz + 3)/4;
+
+ return buf;
+}
+
+static void final_note(u32 *buf)
+{
+ struct elf_note note;
+
+ note.n_namesz = 0;
+ note.n_descsz = 0;
+ note.n_type = 0;
+ memcpy(buf, &note, sizeof(note));
+}
+
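append_elf_note() emits a standard ELF note: the 12-byte header, the NUL-terminated name, then the payload, each advanced to the next 4-byte boundary, which is what the (x + 3)/4 steps over a u32 buffer do. A user-space sketch of the same layout, using Elf64_Nhdr from <elf.h> and a hypothetical payload:

#include <elf.h>
#include <stdio.h>
#include <string.h>

typedef unsigned int u32;

static u32 *append_note(u32 *buf, const char *name, unsigned int type,
			const void *data, size_t len)
{
	Elf64_Nhdr note = { .n_namesz = strlen(name) + 1,
			    .n_descsz = len, .n_type = type };

	memcpy(buf, &note, sizeof(note));
	buf += (sizeof(note) + 3) / 4;          /* 12-byte header, already 4-byte aligned */
	memcpy(buf, name, note.n_namesz);
	buf += (note.n_namesz + 3) / 4;         /* pad name to a 4-byte boundary */
	memcpy(buf, data, note.n_descsz);
	buf += (note.n_descsz + 3) / 4;         /* pad payload likewise */
	return buf;
}

int main(void)
{
	u32 buf[64] = { 0 };
	char payload[10] = "prstatus";          /* stand-in for a struct elf_prstatus */
	u32 *end = append_note(buf, "CORE", NT_PRSTATUS, payload, sizeof(payload));

	printf("note occupies %zu bytes\n", (size_t)(end - buf) * sizeof(u32));
	return 0;
}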
+void crash_save_cpu(struct pt_regs *regs, int cpu)
+{
+ struct elf_prstatus prstatus;
+ u32 *buf;
+
+ if ((cpu < 0) || (cpu >= nr_cpu_ids))
+ return;
+
+ /* Using ELF notes here is opportunistic.
+ * I need a well defined structure format
+ * for the data I pass, and I need tags
+ * on the data to indicate what information I have
+ * squirrelled away. ELF notes happen to provide
+ * all of that, so there is no need to invent something new.
+ */
+ buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
+ if (!buf)
+ return;
+ memset(&prstatus, 0, sizeof(prstatus));
+ prstatus.pr_pid = current->pid;
+ elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
+ buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
+ &prstatus, sizeof(prstatus));
+ final_note(buf);
+}
+
+static int __init crash_notes_memory_init(void)
+{
+ /* Allocate memory for saving cpu registers. */
+ size_t size, align;
+
+ /*
+ * crash_notes could be allocated across 2 vmalloc pages when percpu
+ * is vmalloc based. vmalloc doesn't guarantee 2 contiguous vmalloc
+ * pages are also on 2 contiguous physical pages. In this case the
+ * 2nd part of crash_notes in 2nd page could be lost since only the
+ * starting address and size of crash_notes are exported through sysfs.
+ * Here round up the size of crash_notes to the nearest power of two
+ * and pass it to __alloc_percpu as align value. This can make sure
+ * crash_notes is allocated inside one physical page.
+ */
+ size = sizeof(note_buf_t);
+ align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE);
+
+ /*
+	 * Break the build if size is bigger than PAGE_SIZE, since crash_notes
+	 * would then necessarily span two pages.
+ */
+ BUILD_BUG_ON(size > PAGE_SIZE);
+
+ crash_notes = __alloc_percpu(size, align);
+ if (!crash_notes) {
+ pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+subsys_initcall(crash_notes_memory_init);
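A worked example of the alignment trick above (the size is hypothetical):

	/*
	 * If sizeof(note_buf_t) were 1104 bytes:
	 *   roundup_pow_of_two(1104) = 2048
	 *   __alloc_percpu(1104, 2048) -> the start is 2048-byte aligned and
	 *   the buffer is shorter than 2048 bytes, so it cannot straddle a
	 *   4 KiB page boundary.
	 */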
+
+
+/*
+ * parsing the "crashkernel" commandline
+ *
+ * this code is intended to be called from architecture specific code
+ */
+
+
+/*
+ * This function parses command lines in the format
+ *
+ * crashkernel=ramsize-range:size[,...][@offset]
+ *
+ * The function returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_mem(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ char *cur = cmdline, *tmp;
+
+ /* for each entry of the comma-separated list */
+ do {
+ unsigned long long start, end = ULLONG_MAX, size;
+
+ /* get the start of the range */
+ start = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("crashkernel: Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (*cur != '-') {
+ pr_warn("crashkernel: '-' expected\n");
+ return -EINVAL;
+ }
+ cur++;
+
+		/* if no ':' is here, then we read the end */
+ if (*cur != ':') {
+ end = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("crashkernel: Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (end <= start) {
+ pr_warn("crashkernel: end <= start\n");
+ return -EINVAL;
+ }
+ }
+
+ if (*cur != ':') {
+ pr_warn("crashkernel: ':' expected\n");
+ return -EINVAL;
+ }
+ cur++;
+
+ size = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (size >= system_ram) {
+ pr_warn("crashkernel: invalid size\n");
+ return -EINVAL;
+ }
+
+ /* match ? */
+ if (system_ram >= start && system_ram < end) {
+ *crash_size = size;
+ break;
+ }
+ } while (*cur++ == ',');
+
+ if (*crash_size > 0) {
+ while (*cur && *cur != ' ' && *cur != '@')
+ cur++;
+ if (*cur == '@') {
+ cur++;
+ *crash_base = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("Memory value expected after '@'\n");
+ return -EINVAL;
+ }
+ }
+ }
+
+ return 0;
+}
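A worked example of the extended syntax handled above (the command line and RAM size are hypothetical):

	/*
	 * crashkernel=512M-2G:64M,2G-:128M@16M     with 4G of System RAM
	 *
	 *   "512M-2G:64M" -> 4G is not in [512M, 2G)        -> no match
	 *   "2G-:128M"    -> end defaults to ULLONG_MAX,
	 *                    4G is in [2G, ULLONG_MAX)       -> *crash_size = 128M
	 *   "@16M"        -> *crash_base = 16M
	 */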
+
+/*
+ * That function parses "simple" (old) crashkernel command lines like
+ *
+ * crashkernel=size[@offset]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_simple(char *cmdline,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ char *cur = cmdline;
+
+ *crash_size = memparse(cmdline, &cur);
+ if (cmdline == cur) {
+ pr_warn("crashkernel: memory value expected\n");
+ return -EINVAL;
+ }
+
+ if (*cur == '@')
+ *crash_base = memparse(cur+1, &cur);
+ else if (*cur != ' ' && *cur != '\0') {
+ pr_warn("crashkernel: unrecognized char\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+#define SUFFIX_HIGH 0
+#define SUFFIX_LOW 1
+#define SUFFIX_NULL 2
+static __initdata char *suffix_tbl[] = {
+ [SUFFIX_HIGH] = ",high",
+ [SUFFIX_LOW] = ",low",
+ [SUFFIX_NULL] = NULL,
+};
+
+/*
+ * That function parses "suffix" crashkernel command lines like
+ *
+ * crashkernel=size,[high|low]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_suffix(char *cmdline,
+ unsigned long long *crash_size,
+ const char *suffix)
+{
+ char *cur = cmdline;
+
+ *crash_size = memparse(cmdline, &cur);
+ if (cmdline == cur) {
+ pr_warn("crashkernel: memory value expected\n");
+ return -EINVAL;
+ }
+
+ /* check with suffix */
+ if (strncmp(cur, suffix, strlen(suffix))) {
+ pr_warn("crashkernel: unrecognized char\n");
+ return -EINVAL;
+ }
+ cur += strlen(suffix);
+ if (*cur != ' ' && *cur != '\0') {
+ pr_warn("crashkernel: unrecognized char\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static __init char *get_last_crashkernel(char *cmdline,
+ const char *name,
+ const char *suffix)
+{
+ char *p = cmdline, *ck_cmdline = NULL;
+
+ /* find crashkernel and use the last one if there are more */
+ p = strstr(p, name);
+ while (p) {
+ char *end_p = strchr(p, ' ');
+ char *q;
+
+ if (!end_p)
+ end_p = p + strlen(p);
+
+ if (!suffix) {
+ int i;
+
+ /* skip the one with any known suffix */
+ for (i = 0; suffix_tbl[i]; i++) {
+ q = end_p - strlen(suffix_tbl[i]);
+ if (!strncmp(q, suffix_tbl[i],
+ strlen(suffix_tbl[i])))
+ goto next;
+ }
+ ck_cmdline = p;
+ } else {
+ q = end_p - strlen(suffix);
+ if (!strncmp(q, suffix, strlen(suffix)))
+ ck_cmdline = p;
+ }
+next:
+ p = strstr(p+1, name);
+ }
+
+ if (!ck_cmdline)
+ return NULL;
+
+ return ck_cmdline;
+}
+
+static int __init __parse_crashkernel(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base,
+ const char *name,
+ const char *suffix)
+{
+ char *first_colon, *first_space;
+ char *ck_cmdline;
+
+ BUG_ON(!crash_size || !crash_base);
+ *crash_size = 0;
+ *crash_base = 0;
+
+ ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
+
+ if (!ck_cmdline)
+ return -EINVAL;
+
+ ck_cmdline += strlen(name);
+
+ if (suffix)
+ return parse_crashkernel_suffix(ck_cmdline, crash_size,
+ suffix);
+ /*
+ * if the commandline contains a ':', then that's the extended
+ * syntax -- if not, it must be the classic syntax
+ */
+ first_colon = strchr(ck_cmdline, ':');
+ first_space = strchr(ck_cmdline, ' ');
+ if (first_colon && (!first_space || first_colon < first_space))
+ return parse_crashkernel_mem(ck_cmdline, system_ram,
+ crash_size, crash_base);
+
+ return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
+}
+
+/*
+ * This function is the entry point for command line parsing and should be
+ * called from the arch-specific code.
+ */
+int __init parse_crashkernel(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", NULL);
+}
+
+int __init parse_crashkernel_high(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
+}
+
+int __init parse_crashkernel_low(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", suffix_tbl[SUFFIX_LOW]);
+}
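A minimal sketch of how architecture setup code is expected to drive these entry points (the reservation step and the local names are illustrative):

	unsigned long long crash_size, crash_base;
	int ret;

	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
				&crash_size, &crash_base);
	if (ret == 0 && crash_size > 0) {
		/* reserve [crash_base, crash_base + crash_size) and set up
		 * crashk_res -- arch-specific */
	}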
+
+static void update_vmcoreinfo_note(void)
+{
+ u32 *buf = vmcoreinfo_note;
+
+ if (!vmcoreinfo_size)
+ return;
+ buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
+ vmcoreinfo_size);
+ final_note(buf);
+}
+
+void crash_save_vmcoreinfo(void)
+{
+ vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
+ update_vmcoreinfo_note();
+}
+
+void vmcoreinfo_append_str(const char *fmt, ...)
+{
+ va_list args;
+ char buf[0x50];
+ size_t r;
+
+ va_start(args, fmt);
+ r = vscnprintf(buf, sizeof(buf), fmt, args);
+ va_end(args);
+
+ r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
+
+ memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
+
+ vmcoreinfo_size += r;
+}
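For reference, the VMCOREINFO_* helpers used below boil down to calls of this function; VMCOREINFO_SYMBOL(_stext), for instance, is roughly equivalent to this sketch:

	vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", "_stext",
			      (unsigned long)&_stext);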
+
+/*
+ * provide an empty default implementation here -- architecture
+ * code may override this
+ */
+void __weak arch_crash_save_vmcoreinfo(void)
+{}
+
+unsigned long __weak paddr_vmcoreinfo_note(void)
+{
+ return __pa((unsigned long)(char *)&vmcoreinfo_note);
+}
+
+static int __init crash_save_vmcoreinfo_init(void)
+{
+ VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
+ VMCOREINFO_PAGESIZE(PAGE_SIZE);
+
+ VMCOREINFO_SYMBOL(init_uts_ns);
+ VMCOREINFO_SYMBOL(node_online_map);
+#ifdef CONFIG_MMU
+ VMCOREINFO_SYMBOL(swapper_pg_dir);
+#endif
+ VMCOREINFO_SYMBOL(_stext);
+ VMCOREINFO_SYMBOL(vmap_area_list);
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+ VMCOREINFO_SYMBOL(mem_map);
+ VMCOREINFO_SYMBOL(contig_page_data);
+#endif
+#ifdef CONFIG_SPARSEMEM
+ VMCOREINFO_SYMBOL(mem_section);
+ VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
+ VMCOREINFO_STRUCT_SIZE(mem_section);
+ VMCOREINFO_OFFSET(mem_section, section_mem_map);
+#endif
+ VMCOREINFO_STRUCT_SIZE(page);
+ VMCOREINFO_STRUCT_SIZE(pglist_data);
+ VMCOREINFO_STRUCT_SIZE(zone);
+ VMCOREINFO_STRUCT_SIZE(free_area);
+ VMCOREINFO_STRUCT_SIZE(list_head);
+ VMCOREINFO_SIZE(nodemask_t);
+ VMCOREINFO_OFFSET(page, flags);
+ VMCOREINFO_OFFSET(page, _count);
+ VMCOREINFO_OFFSET(page, mapping);
+ VMCOREINFO_OFFSET(page, lru);
+ VMCOREINFO_OFFSET(page, _mapcount);
+ VMCOREINFO_OFFSET(page, private);
+ VMCOREINFO_OFFSET(pglist_data, node_zones);
+ VMCOREINFO_OFFSET(pglist_data, nr_zones);
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+ VMCOREINFO_OFFSET(pglist_data, node_mem_map);
+#endif
+ VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
+ VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
+ VMCOREINFO_OFFSET(pglist_data, node_id);
+ VMCOREINFO_OFFSET(zone, free_area);
+ VMCOREINFO_OFFSET(zone, vm_stat);
+ VMCOREINFO_OFFSET(zone, spanned_pages);
+ VMCOREINFO_OFFSET(free_area, free_list);
+ VMCOREINFO_OFFSET(list_head, next);
+ VMCOREINFO_OFFSET(list_head, prev);
+ VMCOREINFO_OFFSET(vmap_area, va_start);
+ VMCOREINFO_OFFSET(vmap_area, list);
+ VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
+ log_buf_kexec_setup();
+ VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
+ VMCOREINFO_NUMBER(NR_FREE_PAGES);
+ VMCOREINFO_NUMBER(PG_lru);
+ VMCOREINFO_NUMBER(PG_private);
+ VMCOREINFO_NUMBER(PG_swapcache);
+ VMCOREINFO_NUMBER(PG_slab);
+#ifdef CONFIG_MEMORY_FAILURE
+ VMCOREINFO_NUMBER(PG_hwpoison);
+#endif
+ VMCOREINFO_NUMBER(PG_head_mask);
+ VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
+#ifdef CONFIG_X86
+ VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
+#endif
+#ifdef CONFIG_HUGETLBFS
+ VMCOREINFO_SYMBOL(free_huge_page);
+#endif
+
+ arch_crash_save_vmcoreinfo();
+ update_vmcoreinfo_note();
+
+ return 0;
+}
+
+subsys_initcall(crash_save_vmcoreinfo_init);
+
+/*
+ * Move into place and start executing a preloaded standalone
+ * executable. If nothing was preloaded return an error.
+ */
+int kernel_kexec(void)
+{
+ int error = 0;
+
+ if (!mutex_trylock(&kexec_mutex))
+ return -EBUSY;
+ if (!kexec_image) {
+ error = -EINVAL;
+ goto Unlock;
+ }
+
+#ifdef CONFIG_KEXEC_JUMP
+ if (kexec_image->preserve_context) {
+ lock_system_sleep();
+ pm_prepare_console();
+ error = freeze_processes();
+ if (error) {
+ error = -EBUSY;
+ goto Restore_console;
+ }
+ suspend_console();
+ error = dpm_suspend_start(PMSG_FREEZE);
+ if (error)
+ goto Resume_console;
+ /* At this point, dpm_suspend_start() has been called,
+ * but *not* dpm_suspend_end(). We *must* call
+ * dpm_suspend_end() now. Otherwise, drivers for
+ * some devices (e.g. interrupt controllers) become
+ * desynchronized with the actual state of the
+ * hardware at resume time, and evil weirdness ensues.
+ */
+ error = dpm_suspend_end(PMSG_FREEZE);
+ if (error)
+ goto Resume_devices;
+ error = disable_nonboot_cpus();
+ if (error)
+ goto Enable_cpus;
+ local_irq_disable();
+ error = syscore_suspend();
+ if (error)
+ goto Enable_irqs;
+ } else
+#endif
+ {
+ kexec_in_progress = true;
+ kernel_restart_prepare(NULL);
+ migrate_to_reboot_cpu();
+
+ /*
+ * migrate_to_reboot_cpu() disables CPU hotplug assuming that
+ * no further code needs to use CPU hotplug (which is true in
+ * the reboot case). However, the kexec path depends on using
+ * CPU hotplug again; so re-enable it here.
+ */
+ cpu_hotplug_enable();
+ pr_emerg("Starting new kernel\n");
+ machine_shutdown();
+ }
+
+ machine_kexec(kexec_image);
+
+#ifdef CONFIG_KEXEC_JUMP
+ if (kexec_image->preserve_context) {
+ syscore_resume();
+ Enable_irqs:
+ local_irq_enable();
+ Enable_cpus:
+ enable_nonboot_cpus();
+ dpm_resume_start(PMSG_RESTORE);
+ Resume_devices:
+ dpm_resume_end(PMSG_RESTORE);
+ Resume_console:
+ resume_console();
+ thaw_processes();
+ Restore_console:
+ pm_restore_console();
+ unlock_system_sleep();
+ }
+#endif
+
+ Unlock:
+ mutex_unlock(&kexec_mutex);
+ return error;
+}
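For context, this path is reached from user space through the reboot() system call once an image has been loaded; a minimal, illustrative trigger (requires CAP_SYS_BOOT) looks like:

	#include <unistd.h>
	#include <sys/reboot.h>
	#include <linux/reboot.h>

	int main(void)
	{
		/* boots the image previously loaded via kexec_load()/kexec_file_load() */
		return reboot(LINUX_REBOOT_CMD_KEXEC);
	}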
+
+/*
+ * Add and remove page tables for crashkernel memory
+ *
+ * Provide an empty default implementation here -- architecture
+ * code may override this
+ */
+void __weak crash_map_reserved_pages(void)
+{}
+
+void __weak crash_unmap_reserved_pages(void)
+{}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
new file mode 100644
index 000000000000..6a9a3f2a0e8e
--- /dev/null
+++ b/kernel/kexec_file.c
@@ -0,0 +1,1045 @@
+/*
+ * kexec: kexec_file_load system call
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ * Authors:
+ * Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/capability.h>
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/kexec.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include <linux/syscalls.h>
+#include <linux/vmalloc.h>
+#include "kexec_internal.h"
+
+/*
+ * Declare these symbols weak so that if the architecture provides a
+ * purgatory, these will be overridden.
+ */
+char __weak kexec_purgatory[0];
+size_t __weak kexec_purgatory_size = 0;
+
+static int kexec_calculate_store_digests(struct kimage *image);
+
+static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
+{
+ struct fd f = fdget(fd);
+ int ret;
+ struct kstat stat;
+ loff_t pos;
+ ssize_t bytes = 0;
+
+ if (!f.file)
+ return -EBADF;
+
+ ret = vfs_getattr(&f.file->f_path, &stat);
+ if (ret)
+ goto out;
+
+ if (stat.size > INT_MAX) {
+ ret = -EFBIG;
+ goto out;
+ }
+
+ /* Don't hand 0 to vmalloc, it whines. */
+ if (stat.size == 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ *buf = vmalloc(stat.size);
+ if (!*buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ pos = 0;
+ while (pos < stat.size) {
+ bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
+ stat.size - pos);
+ if (bytes < 0) {
+ vfree(*buf);
+ ret = bytes;
+ goto out;
+ }
+
+ if (bytes == 0)
+ break;
+ pos += bytes;
+ }
+
+ if (pos != stat.size) {
+ ret = -EBADF;
+ vfree(*buf);
+ goto out;
+ }
+
+ *buf_len = pos;
+out:
+ fdput(f);
+ return ret;
+}
+
+/* Architectures can provide this probe function */
+int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ return -ENOEXEC;
+}
+
+void * __weak arch_kexec_kernel_image_load(struct kimage *image)
+{
+ return ERR_PTR(-ENOEXEC);
+}
+
+int __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
+{
+ return -EINVAL;
+}
+
+int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ return -EKEYREJECTED;
+}
+
+/* Apply relocations of type RELA */
+int __weak
+arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+ unsigned int relsec)
+{
+ pr_err("RELA relocation unsupported.\n");
+ return -ENOEXEC;
+}
+
+/* Apply relocations of type REL */
+int __weak
+arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+ unsigned int relsec)
+{
+ pr_err("REL relocation unsupported.\n");
+ return -ENOEXEC;
+}
+
+/*
+ * Free up memory used by the kernel, initrd, and command line. These are
+ * temporary allocations that are no longer needed once the buffers have
+ * been loaded into separate segments and copied elsewhere.
+ */
+void kimage_file_post_load_cleanup(struct kimage *image)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+
+ vfree(image->kernel_buf);
+ image->kernel_buf = NULL;
+
+ vfree(image->initrd_buf);
+ image->initrd_buf = NULL;
+
+ kfree(image->cmdline_buf);
+ image->cmdline_buf = NULL;
+
+ vfree(pi->purgatory_buf);
+ pi->purgatory_buf = NULL;
+
+ vfree(pi->sechdrs);
+ pi->sechdrs = NULL;
+
+ /* See if architecture has anything to cleanup post load */
+ arch_kimage_file_post_load_cleanup(image);
+
+ /*
+	 * The call above should have given the image loader a chance to free
+	 * any data stored in kimage->image_loader_data; it is now safe to
+	 * free it here.
+ */
+ kfree(image->image_loader_data);
+ image->image_loader_data = NULL;
+}
+
+/*
+ * In file mode the list of segments is prepared by the kernel. Copy the
+ * relevant data from user space, do error checking, and prepare the segment list.
+ */
+static int
+kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
+ const char __user *cmdline_ptr,
+ unsigned long cmdline_len, unsigned flags)
+{
+ int ret = 0;
+ void *ldata;
+
+ ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
+ &image->kernel_buf_len);
+ if (ret)
+ return ret;
+
+ /* Call arch image probe handlers */
+ ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
+ image->kernel_buf_len);
+
+ if (ret)
+ goto out;
+
+#ifdef CONFIG_KEXEC_VERIFY_SIG
+ ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
+ image->kernel_buf_len);
+ if (ret) {
+ pr_debug("kernel signature verification failed.\n");
+ goto out;
+ }
+ pr_debug("kernel signature verification successful.\n");
+#endif
+	/* It is possible that no initramfs is being loaded */
+ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
+ ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
+ &image->initrd_buf_len);
+ if (ret)
+ goto out;
+ }
+
+ if (cmdline_len) {
+ image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
+ if (!image->cmdline_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
+ cmdline_len);
+ if (ret) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ image->cmdline_buf_len = cmdline_len;
+
+ /* command line should be a string with last byte null */
+ if (image->cmdline_buf[cmdline_len - 1] != '\0') {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ /* Call arch image load handlers */
+ ldata = arch_kexec_kernel_image_load(image);
+
+ if (IS_ERR(ldata)) {
+ ret = PTR_ERR(ldata);
+ goto out;
+ }
+
+ image->image_loader_data = ldata;
+out:
+ /* In case of error, free up all allocated memory in this function */
+ if (ret)
+ kimage_file_post_load_cleanup(image);
+ return ret;
+}
+
+static int
+kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
+ int initrd_fd, const char __user *cmdline_ptr,
+ unsigned long cmdline_len, unsigned long flags)
+{
+ int ret;
+ struct kimage *image;
+ bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
+
+ image = do_kimage_alloc_init();
+ if (!image)
+ return -ENOMEM;
+
+ image->file_mode = 1;
+
+ if (kexec_on_panic) {
+ /* Enable special crash kernel control page alloc policy. */
+ image->control_page = crashk_res.start;
+ image->type = KEXEC_TYPE_CRASH;
+ }
+
+ ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
+ cmdline_ptr, cmdline_len, flags);
+ if (ret)
+ goto out_free_image;
+
+ ret = sanity_check_segment_list(image);
+ if (ret)
+ goto out_free_post_load_bufs;
+
+ ret = -ENOMEM;
+ image->control_code_page = kimage_alloc_control_pages(image,
+ get_order(KEXEC_CONTROL_PAGE_SIZE));
+ if (!image->control_code_page) {
+ pr_err("Could not allocate control_code_buffer\n");
+ goto out_free_post_load_bufs;
+ }
+
+ if (!kexec_on_panic) {
+ image->swap_page = kimage_alloc_control_pages(image, 0);
+ if (!image->swap_page) {
+ pr_err("Could not allocate swap buffer\n");
+ goto out_free_control_pages;
+ }
+ }
+
+ *rimage = image;
+ return 0;
+out_free_control_pages:
+ kimage_free_page_list(&image->control_pages);
+out_free_post_load_bufs:
+ kimage_file_post_load_cleanup(image);
+out_free_image:
+ kfree(image);
+ return ret;
+}
+
+SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
+ unsigned long, cmdline_len, const char __user *, cmdline_ptr,
+ unsigned long, flags)
+{
+ int ret = 0, i;
+ struct kimage **dest_image, *image;
+
+ /* We only trust the superuser with rebooting the system. */
+ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+ return -EPERM;
+
+ /* Make sure we have a legal set of flags */
+ if (flags != (flags & KEXEC_FILE_FLAGS))
+ return -EINVAL;
+
+ image = NULL;
+
+ if (!mutex_trylock(&kexec_mutex))
+ return -EBUSY;
+
+ dest_image = &kexec_image;
+ if (flags & KEXEC_FILE_ON_CRASH)
+ dest_image = &kexec_crash_image;
+
+ if (flags & KEXEC_FILE_UNLOAD)
+ goto exchange;
+
+ /*
+	 * In the crash case the new kernel is loaded into the reserved region,
+	 * the same memory where an old crash kernel might already be loaded.
+	 * Free any currently loaded crash dump kernel before we corrupt it.
+ */
+ if (flags & KEXEC_FILE_ON_CRASH)
+ kimage_free(xchg(&kexec_crash_image, NULL));
+
+ ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
+ cmdline_len, flags);
+ if (ret)
+ goto out;
+
+ ret = machine_kexec_prepare(image);
+ if (ret)
+ goto out;
+
+ ret = kexec_calculate_store_digests(image);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < image->nr_segments; i++) {
+ struct kexec_segment *ksegment;
+
+ ksegment = &image->segment[i];
+ pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
+ i, ksegment->buf, ksegment->bufsz, ksegment->mem,
+ ksegment->memsz);
+
+ ret = kimage_load_segment(image, &image->segment[i]);
+ if (ret)
+ goto out;
+ }
+
+ kimage_terminate(image);
+
+ /*
+ * Free up any temporary buffers allocated which are not needed
+ * after image has been loaded
+ */
+ kimage_file_post_load_cleanup(image);
+exchange:
+ image = xchg(dest_image, image);
+out:
+ mutex_unlock(&kexec_mutex);
+ kimage_free(image);
+ return ret;
+}
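A user-space sketch of driving this system call (error handling omitted; the file paths and the availability of __NR_kexec_file_load on the architecture are assumptions):

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	static long load_kexec_image(const char *kernel, const char *initrd,
				     const char *cmdline)
	{
		int kernel_fd = open(kernel, O_RDONLY);
		int initrd_fd = open(initrd, O_RDONLY);

		/* kernel_fd, initrd_fd, cmdline_len (incl. NUL), cmdline, flags */
		return syscall(__NR_kexec_file_load, kernel_fd, initrd_fd,
			       strlen(cmdline) + 1, cmdline, 0UL);
	}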
+
+static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
+ struct kexec_buf *kbuf)
+{
+ struct kimage *image = kbuf->image;
+ unsigned long temp_start, temp_end;
+
+ temp_end = min(end, kbuf->buf_max);
+ temp_start = temp_end - kbuf->memsz;
+
+ do {
+ /* align down start */
+ temp_start = temp_start & (~(kbuf->buf_align - 1));
+
+ if (temp_start < start || temp_start < kbuf->buf_min)
+ return 0;
+
+ temp_end = temp_start + kbuf->memsz - 1;
+
+ /*
+		 * Make sure this does not conflict with any existing
+		 * segments
+ */
+ if (kimage_is_destination_range(image, temp_start, temp_end)) {
+ temp_start = temp_start - PAGE_SIZE;
+ continue;
+ }
+
+ /* We found a suitable memory range */
+ break;
+ } while (1);
+
+ /* If we are here, we found a suitable memory range */
+ kbuf->mem = temp_start;
+
+ /* Success, stop navigating through remaining System RAM ranges */
+ return 1;
+}
+
+static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
+ struct kexec_buf *kbuf)
+{
+ struct kimage *image = kbuf->image;
+ unsigned long temp_start, temp_end;
+
+ temp_start = max(start, kbuf->buf_min);
+
+ do {
+ temp_start = ALIGN(temp_start, kbuf->buf_align);
+ temp_end = temp_start + kbuf->memsz - 1;
+
+ if (temp_end > end || temp_end > kbuf->buf_max)
+ return 0;
+ /*
+		 * Make sure this does not conflict with any existing
+		 * segments
+ */
+ if (kimage_is_destination_range(image, temp_start, temp_end)) {
+ temp_start = temp_start + PAGE_SIZE;
+ continue;
+ }
+
+ /* We found a suitable memory range */
+ break;
+ } while (1);
+
+ /* If we are here, we found a suitable memory range */
+ kbuf->mem = temp_start;
+
+ /* Success, stop navigating through remaining System RAM ranges */
+ return 1;
+}
+
+static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
+{
+ struct kexec_buf *kbuf = (struct kexec_buf *)arg;
+ unsigned long sz = end - start + 1;
+
+	/* Returning 0 moves on to the next memory range */
+ if (sz < kbuf->memsz)
+ return 0;
+
+ if (end < kbuf->buf_min || start > kbuf->buf_max)
+ return 0;
+
+ /*
+	 * Allocate memory top-down within the RAM range if requested,
+	 * otherwise allocate bottom-up.
+ */
+ if (kbuf->top_down)
+ return locate_mem_hole_top_down(start, end, kbuf);
+ return locate_mem_hole_bottom_up(start, end, kbuf);
+}
+
+/*
+ * Helper function for placing a buffer in a kexec segment. This assumes
+ * that kexec_mutex is held.
+ */
+int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
+ unsigned long memsz, unsigned long buf_align,
+ unsigned long buf_min, unsigned long buf_max,
+ bool top_down, unsigned long *load_addr)
+{
+
+ struct kexec_segment *ksegment;
+ struct kexec_buf buf, *kbuf;
+ int ret;
+
+	/* Currently, adding a segment this way is allowed only in file mode */
+ if (!image->file_mode)
+ return -EINVAL;
+
+ if (image->nr_segments >= KEXEC_SEGMENT_MAX)
+ return -EINVAL;
+
+ /*
+	 * Make sure we are not trying to add a buffer after the control
+	 * pages have been allocated. All segments need to be placed before
+	 * any control pages are allocated, because the control page
+	 * allocation logic walks the list of segments to make sure there
+	 * are no destination overlaps.
+ */
+ if (!list_empty(&image->control_pages)) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ memset(&buf, 0, sizeof(struct kexec_buf));
+ kbuf = &buf;
+ kbuf->image = image;
+ kbuf->buffer = buffer;
+ kbuf->bufsz = bufsz;
+
+ kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
+ kbuf->buf_align = max(buf_align, PAGE_SIZE);
+ kbuf->buf_min = buf_min;
+ kbuf->buf_max = buf_max;
+ kbuf->top_down = top_down;
+
+ /* Walk the RAM ranges and allocate a suitable range for the buffer */
+ if (image->type == KEXEC_TYPE_CRASH)
+ ret = walk_iomem_res("Crash kernel",
+ IORESOURCE_MEM | IORESOURCE_BUSY,
+ crashk_res.start, crashk_res.end, kbuf,
+ locate_mem_hole_callback);
+ else
+ ret = walk_system_ram_res(0, -1, kbuf,
+ locate_mem_hole_callback);
+ if (ret != 1) {
+ /* A suitable memory range could not be found for buffer */
+ return -EADDRNOTAVAIL;
+ }
+
+ /* Found a suitable memory range */
+ ksegment = &image->segment[image->nr_segments];
+ ksegment->kbuf = kbuf->buffer;
+ ksegment->bufsz = kbuf->bufsz;
+ ksegment->mem = kbuf->mem;
+ ksegment->memsz = kbuf->memsz;
+ image->nr_segments++;
+ *load_addr = ksegment->mem;
+ return 0;
+}
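A sketch of how an architecture image loader might place an initrd with this helper (min_addr and max_addr are illustrative bounds chosen by the loader):

	unsigned long initrd_load_addr;
	int ret;

	ret = kexec_add_buffer(image, image->initrd_buf, image->initrd_buf_len,
			       image->initrd_buf_len, PAGE_SIZE,
			       min_addr, max_addr, /* top_down */ true,
			       &initrd_load_addr);
	if (ret)
		return ret;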
+
+/* Calculate and store the digest of segments */
+static int kexec_calculate_store_digests(struct kimage *image)
+{
+ struct crypto_shash *tfm;
+ struct shash_desc *desc;
+ int ret = 0, i, j, zero_buf_sz, sha_region_sz;
+ size_t desc_size, nullsz;
+ char *digest;
+ void *zero_buf;
+ struct kexec_sha_region *sha_regions;
+ struct purgatory_info *pi = &image->purgatory_info;
+
+ zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
+ zero_buf_sz = PAGE_SIZE;
+
+ tfm = crypto_alloc_shash("sha256", 0, 0);
+ if (IS_ERR(tfm)) {
+ ret = PTR_ERR(tfm);
+ goto out;
+ }
+
+ desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
+ desc = kzalloc(desc_size, GFP_KERNEL);
+ if (!desc) {
+ ret = -ENOMEM;
+ goto out_free_tfm;
+ }
+
+ sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
+ sha_regions = vzalloc(sha_region_sz);
+ if (!sha_regions)
+ goto out_free_desc;
+
+ desc->tfm = tfm;
+ desc->flags = 0;
+
+ ret = crypto_shash_init(desc);
+ if (ret < 0)
+ goto out_free_sha_regions;
+
+ digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
+ if (!digest) {
+ ret = -ENOMEM;
+ goto out_free_sha_regions;
+ }
+
+ for (j = i = 0; i < image->nr_segments; i++) {
+ struct kexec_segment *ksegment;
+
+ ksegment = &image->segment[i];
+ /*
+ * Skip purgatory as it will be modified once we put digest
+ * info in purgatory.
+ */
+ if (ksegment->kbuf == pi->purgatory_buf)
+ continue;
+
+ ret = crypto_shash_update(desc, ksegment->kbuf,
+ ksegment->bufsz);
+ if (ret)
+ break;
+
+ /*
+ * Assume rest of the buffer is filled with zero and
+ * update digest accordingly.
+ */
+ nullsz = ksegment->memsz - ksegment->bufsz;
+ while (nullsz) {
+ unsigned long bytes = nullsz;
+
+ if (bytes > zero_buf_sz)
+ bytes = zero_buf_sz;
+ ret = crypto_shash_update(desc, zero_buf, bytes);
+ if (ret)
+ break;
+ nullsz -= bytes;
+ }
+
+ if (ret)
+ break;
+
+ sha_regions[j].start = ksegment->mem;
+ sha_regions[j].len = ksegment->memsz;
+ j++;
+ }
+
+ if (!ret) {
+ ret = crypto_shash_final(desc, digest);
+ if (ret)
+ goto out_free_digest;
+ ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
+ sha_regions, sha_region_sz, 0);
+ if (ret)
+ goto out_free_digest;
+
+ ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
+ digest, SHA256_DIGEST_SIZE, 0);
+ if (ret)
+ goto out_free_digest;
+ }
+
+out_free_digest:
+ kfree(digest);
+out_free_sha_regions:
+ vfree(sha_regions);
+out_free_desc:
+ kfree(desc);
+out_free_tfm:
+ kfree(tfm);
+out:
+ return ret;
+}
+
+/* Actually load purgatory. A lot of this code is taken from kexec-tools. */
+static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
+ unsigned long max, int top_down)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+ unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
+ unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
+ unsigned char *buf_addr, *src;
+ int i, ret = 0, entry_sidx = -1;
+ const Elf_Shdr *sechdrs_c;
+ Elf_Shdr *sechdrs = NULL;
+ void *purgatory_buf = NULL;
+
+ /*
+	 * sechdrs_c points to the section headers in purgatory, which are
+	 * read-only. No modifications are allowed.
+ */
+ sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
+
+ /*
+	 * We cannot modify sechdrs_c[] or its fields; it is read-only.
+	 * Copy it over to a local copy where we can store some temporary
+	 * data and free it at the end. We need to modify the ->sh_addr and
+	 * ->sh_offset fields to keep track of the permanent and temporary
+	 * locations of sections.
+ */
+ sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+ if (!sechdrs)
+ return -ENOMEM;
+
+ memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+
+ /*
+	 * We end up with multiple copies of the sections. The first copy is
+	 * the one embedded in the kernel in a read-only section. Some of
+	 * these sections will be copied to a temporary buffer, relocated,
+	 * and finally copied to their destination at segment load time.
+	 *
+	 * Use ->sh_offset to reflect the section address in memory. It will
+	 * point to the original read-only copy if the section is not
+	 * allocatable; otherwise it will point to the temporary copy which
+	 * will be relocated.
+	 *
+	 * Use ->sh_addr to hold the final address of the section, where it
+	 * will live at execution time.
+ */
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (sechdrs[i].sh_type == SHT_NOBITS)
+ continue;
+
+ sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
+ sechdrs[i].sh_offset;
+ }
+
+ /*
+ * Identify entry point section and make entry relative to section
+ * start.
+ */
+ entry = pi->ehdr->e_entry;
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
+ continue;
+
+ /* Make entry section relative */
+ if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
+ ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
+ pi->ehdr->e_entry)) {
+ entry_sidx = i;
+ entry -= sechdrs[i].sh_addr;
+ break;
+ }
+ }
+
+ /* Determine how much memory is needed to load relocatable object. */
+ buf_align = 1;
+ bss_align = 1;
+ buf_sz = 0;
+ bss_sz = 0;
+
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ align = sechdrs[i].sh_addralign;
+ if (sechdrs[i].sh_type != SHT_NOBITS) {
+ if (buf_align < align)
+ buf_align = align;
+ buf_sz = ALIGN(buf_sz, align);
+ buf_sz += sechdrs[i].sh_size;
+ } else {
+ /* bss section */
+ if (bss_align < align)
+ bss_align = align;
+ bss_sz = ALIGN(bss_sz, align);
+ bss_sz += sechdrs[i].sh_size;
+ }
+ }
+
+ /* Determine the bss padding required to align bss properly */
+ bss_pad = 0;
+ if (buf_sz & (bss_align - 1))
+ bss_pad = bss_align - (buf_sz & (bss_align - 1));
+
+ memsz = buf_sz + bss_pad + bss_sz;
+
+ /* Allocate buffer for purgatory */
+ purgatory_buf = vzalloc(buf_sz);
+ if (!purgatory_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (buf_align < bss_align)
+ buf_align = bss_align;
+
+ /* Add buffer to segment list */
+ ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
+ buf_align, min, max, top_down,
+ &pi->purgatory_load_addr);
+ if (ret)
+ goto out;
+
+ /* Load SHF_ALLOC sections */
+ buf_addr = purgatory_buf;
+ load_addr = curr_load_addr = pi->purgatory_load_addr;
+ bss_addr = load_addr + buf_sz + bss_pad;
+
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ align = sechdrs[i].sh_addralign;
+ if (sechdrs[i].sh_type != SHT_NOBITS) {
+ curr_load_addr = ALIGN(curr_load_addr, align);
+ offset = curr_load_addr - load_addr;
+			/* We already modified ->sh_offset to hold the src addr */
+ src = (char *) sechdrs[i].sh_offset;
+ memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
+
+ /* Store load address and source address of section */
+ sechdrs[i].sh_addr = curr_load_addr;
+
+ /*
+ * This section got copied to temporary buffer. Update
+ * ->sh_offset accordingly.
+ */
+ sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
+
+ /* Advance to the next address */
+ curr_load_addr += sechdrs[i].sh_size;
+ } else {
+ bss_addr = ALIGN(bss_addr, align);
+ sechdrs[i].sh_addr = bss_addr;
+ bss_addr += sechdrs[i].sh_size;
+ }
+ }
+
+ /* Update entry point based on load address of text section */
+ if (entry_sidx >= 0)
+ entry += sechdrs[entry_sidx].sh_addr;
+
+ /* Make kernel jump to purgatory after shutdown */
+ image->start = entry;
+
+ /* Used later to get/set symbol values */
+ pi->sechdrs = sechdrs;
+
+ /*
+	 * Used later to identify which section is purgatory so it can be
+	 * skipped when checksumming.
+ */
+ pi->purgatory_buf = purgatory_buf;
+ return ret;
+out:
+ vfree(sechdrs);
+ vfree(purgatory_buf);
+ return ret;
+}
+
+static int kexec_apply_relocations(struct kimage *image)
+{
+ int i, ret;
+ struct purgatory_info *pi = &image->purgatory_info;
+ Elf_Shdr *sechdrs = pi->sechdrs;
+
+ /* Apply relocations */
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ Elf_Shdr *section, *symtab;
+
+ if (sechdrs[i].sh_type != SHT_RELA &&
+ sechdrs[i].sh_type != SHT_REL)
+ continue;
+
+ /*
+		 * For sections of type SHT_RELA/SHT_REL, ->sh_link contains
+		 * the section header index of the associated symbol table,
+		 * and ->sh_info contains the section header index of the
+		 * section to which the relocations apply.
+ */
+ if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
+ sechdrs[i].sh_link >= pi->ehdr->e_shnum)
+ return -ENOEXEC;
+
+ section = &sechdrs[sechdrs[i].sh_info];
+ symtab = &sechdrs[sechdrs[i].sh_link];
+
+ if (!(section->sh_flags & SHF_ALLOC))
+ continue;
+
+ /*
+		 * symtab->sh_link contains the section header index of the
+		 * associated string table.
+ */
+ if (symtab->sh_link >= pi->ehdr->e_shnum)
+ /* Invalid section number? */
+ continue;
+
+ /*
+		 * The architecture needs to provide support for applying
+ * relocations of type SHT_RELA/SHT_REL.
+ */
+ if (sechdrs[i].sh_type == SHT_RELA)
+ ret = arch_kexec_apply_relocations_add(pi->ehdr,
+ sechdrs, i);
+ else if (sechdrs[i].sh_type == SHT_REL)
+ ret = arch_kexec_apply_relocations(pi->ehdr,
+ sechdrs, i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/* Load relocatable purgatory object and relocate it appropriately */
+int kexec_load_purgatory(struct kimage *image, unsigned long min,
+ unsigned long max, int top_down,
+ unsigned long *load_addr)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+ int ret;
+
+ if (kexec_purgatory_size <= 0)
+ return -EINVAL;
+
+ if (kexec_purgatory_size < sizeof(Elf_Ehdr))
+ return -ENOEXEC;
+
+ pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
+
+ if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
+ || pi->ehdr->e_type != ET_REL
+ || !elf_check_arch(pi->ehdr)
+ || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
+ return -ENOEXEC;
+
+ if (pi->ehdr->e_shoff >= kexec_purgatory_size
+ || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
+ kexec_purgatory_size - pi->ehdr->e_shoff))
+ return -ENOEXEC;
+
+ ret = __kexec_load_purgatory(image, min, max, top_down);
+ if (ret)
+ return ret;
+
+ ret = kexec_apply_relocations(image);
+ if (ret)
+ goto out;
+
+ *load_addr = pi->purgatory_load_addr;
+ return 0;
+out:
+ vfree(pi->sechdrs);
+ vfree(pi->purgatory_buf);
+ return ret;
+}
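A sketch of the expected call from an architecture loader (the address bounds are illustrative):

	unsigned long purgatory_load_addr;
	int ret;

	ret = kexec_load_purgatory(image, min_addr, max_addr, /* top_down */ 1,
				   &purgatory_load_addr);
	if (ret)
		pr_err("Loading purgatory failed\n");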
+
+static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
+ const char *name)
+{
+ Elf_Sym *syms;
+ Elf_Shdr *sechdrs;
+ Elf_Ehdr *ehdr;
+ int i, k;
+ const char *strtab;
+
+ if (!pi->sechdrs || !pi->ehdr)
+ return NULL;
+
+ sechdrs = pi->sechdrs;
+ ehdr = pi->ehdr;
+
+ for (i = 0; i < ehdr->e_shnum; i++) {
+ if (sechdrs[i].sh_type != SHT_SYMTAB)
+ continue;
+
+ if (sechdrs[i].sh_link >= ehdr->e_shnum)
+ /* Invalid strtab section number */
+ continue;
+ strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
+ syms = (Elf_Sym *)sechdrs[i].sh_offset;
+
+ /* Go through symbols for a match */
+ for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
+ if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
+ continue;
+
+ if (strcmp(strtab + syms[k].st_name, name) != 0)
+ continue;
+
+ if (syms[k].st_shndx == SHN_UNDEF ||
+ syms[k].st_shndx >= ehdr->e_shnum) {
+ pr_debug("Symbol: %s has bad section index %d.\n",
+ name, syms[k].st_shndx);
+ return NULL;
+ }
+
+ /* Found the symbol we are looking for */
+ return &syms[k];
+ }
+ }
+
+ return NULL;
+}
+
+void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+ Elf_Sym *sym;
+ Elf_Shdr *sechdr;
+
+ sym = kexec_purgatory_find_symbol(pi, name);
+ if (!sym)
+ return ERR_PTR(-EINVAL);
+
+ sechdr = &pi->sechdrs[sym->st_shndx];
+
+ /*
+	 * Returns the address where the symbol will finally be loaded after
+ * kexec_load_segment()
+ */
+ return (void *)(sechdr->sh_addr + sym->st_value);
+}
+
+/*
+ * Get or set the value of a symbol. If "get_value" is true, the symbol value
+ * is returned in buf; otherwise the symbol value is set from the value in buf.
+ */
+int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
+ void *buf, unsigned int size, bool get_value)
+{
+ Elf_Sym *sym;
+ Elf_Shdr *sechdrs;
+ struct purgatory_info *pi = &image->purgatory_info;
+ char *sym_buf;
+
+ sym = kexec_purgatory_find_symbol(pi, name);
+ if (!sym)
+ return -EINVAL;
+
+ if (sym->st_size != size) {
+ pr_err("symbol %s size mismatch: expected %lu actual %u\n",
+ name, (unsigned long)sym->st_size, size);
+ return -EINVAL;
+ }
+
+ sechdrs = pi->sechdrs;
+
+ if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
+ pr_err("symbol %s is in a bss section. Cannot %s\n", name,
+ get_value ? "get" : "set");
+ return -EINVAL;
+ }
+
+ sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
+ sym->st_value;
+
+ if (get_value)
+ memcpy((void *)buf, sym_buf, size);
+ else
+ memcpy((void *)sym_buf, buf, size);
+
+ return 0;
+}
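A sketch of passing a value into purgatory with this helper (the symbol name "stack_top" and its value are hypothetical; a real loader uses whatever globals its purgatory blob exports):

	unsigned long stack_top = 0;	/* illustrative value */
	int ret;

	ret = kexec_purgatory_get_set_symbol(image, "stack_top", &stack_top,
					     sizeof(stack_top), false);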
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
new file mode 100644
index 000000000000..e4392a698ad4
--- /dev/null
+++ b/kernel/kexec_internal.h
@@ -0,0 +1,22 @@
+#ifndef LINUX_KEXEC_INTERNAL_H
+#define LINUX_KEXEC_INTERNAL_H
+
+#include <linux/kexec.h>
+
+struct kimage *do_kimage_alloc_init(void);
+int sanity_check_segment_list(struct kimage *image);
+void kimage_free_page_list(struct list_head *list);
+void kimage_free(struct kimage *image);
+int kimage_load_segment(struct kimage *image, struct kexec_segment *segment);
+void kimage_terminate(struct kimage *image);
+int kimage_is_destination_range(struct kimage *image,
+ unsigned long start, unsigned long end);
+
+extern struct mutex kexec_mutex;
+
+#ifdef CONFIG_KEXEC_FILE
+void kimage_file_post_load_cleanup(struct kimage *image);
+#else /* CONFIG_KEXEC_FILE */
+static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
+#endif /* CONFIG_KEXEC_FILE */
+#endif /* LINUX_KEXEC_INTERNAL_H */
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 2777f40a9c7b..da98d0593de2 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -45,8 +45,6 @@
extern int max_threads;
-static struct workqueue_struct *khelper_wq;
-
#define CAP_BSET (void *)1
#define CAP_PI (void *)2
@@ -114,10 +112,11 @@ out:
* @...: arguments as specified in the format string
*
* Load a module using the user mode module loader. The function returns
- * zero on success or a negative errno code on failure. Note that a
- * successful module load does not mean the module did not then unload
- * and exit on an error of its own. Callers must check that the service
- * they requested is now available not blindly invoke it.
+ * zero on success or a negative errno code or positive exit code from
+ * "modprobe" on failure. Note that a successful module load does not mean
+ * the module did not then unload and exit on an error of its own. Callers
+ * must check that the service they requested is now available not blindly
+ * invoke it.
*
* If module auto-loading support is disabled then this function
* becomes a no-operation.
@@ -213,7 +212,7 @@ static void umh_complete(struct subprocess_info *sub_info)
/*
* This is the task which runs the usermode application
*/
-static int ____call_usermodehelper(void *data)
+static int call_usermodehelper_exec_async(void *data)
{
struct subprocess_info *sub_info = data;
struct cred *new;
@@ -223,12 +222,9 @@ static int ____call_usermodehelper(void *data)
flush_signal_handlers(current, 1);
spin_unlock_irq(&current->sighand->siglock);
- /* We can run anywhere, unlike our parent keventd(). */
- set_cpus_allowed_ptr(current, cpu_all_mask);
-
/*
- * Our parent is keventd, which runs with elevated scheduling priority.
- * Avoid propagating that into the userspace child.
+ * Our parent (unbound workqueue) runs with elevated scheduling
+ * priority. Avoid propagating that into the userspace child.
*/
set_user_nice(current, 0);
@@ -258,7 +254,10 @@ static int ____call_usermodehelper(void *data)
(const char __user *const __user *)sub_info->envp);
out:
sub_info->retval = retval;
- /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */
+ /*
+ * call_usermodehelper_exec_sync() will call umh_complete
+	 * if UMH_WAIT_PROC.
+ */
if (!(sub_info->wait & UMH_WAIT_PROC))
umh_complete(sub_info);
if (!retval)
@@ -266,15 +265,14 @@ out:
do_exit(0);
}
-/* Keventd can't block, but this (a child) can. */
-static int wait_for_helper(void *data)
+/* Handles UMH_WAIT_PROC. */
+static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
{
- struct subprocess_info *sub_info = data;
pid_t pid;
/* If SIGCLD is ignored sys_wait4 won't populate the status. */
kernel_sigaction(SIGCHLD, SIG_DFL);
- pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
+ pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
if (pid < 0) {
sub_info->retval = pid;
} else {
@@ -282,44 +280,60 @@ static int wait_for_helper(void *data)
/*
* Normally it is bogus to call wait4() from in-kernel because
* wait4() wants to write the exit code to a userspace address.
- * But wait_for_helper() always runs as keventd, and put_user()
- * to a kernel address works OK for kernel threads, due to their
- * having an mm_segment_t which spans the entire address space.
+ * But call_usermodehelper_exec_sync() always runs as kernel
+ * thread (workqueue) and put_user() to a kernel address works
+ * OK for kernel threads, due to their having an mm_segment_t
+ * which spans the entire address space.
*
* Thus the __user pointer cast is valid here.
*/
sys_wait4(pid, (int __user *)&ret, 0, NULL);
/*
- * If ret is 0, either ____call_usermodehelper failed and the
- * real error code is already in sub_info->retval or
+ * If ret is 0, either call_usermodehelper_exec_async failed and
+ * the real error code is already in sub_info->retval or
* sub_info->retval is 0 anyway, so don't mess with it then.
*/
if (ret)
sub_info->retval = ret;
}
+ /* Restore default kernel sig handler */
+ kernel_sigaction(SIGCHLD, SIG_IGN);
+
umh_complete(sub_info);
- do_exit(0);
}
-/* This is run by khelper thread */
-static void __call_usermodehelper(struct work_struct *work)
+/*
+ * We need to create the usermodehelper kernel thread from a task that is
+ * affine to an optimized set of CPUs (or the nohz housekeeping ones) so that
+ * it inherits the widest possible affinity, irrespective of
+ * call_usermodehelper() callers with possibly reduced affinity (e.g. per-cpu
+ * workqueues). We don't want usermodehelper targets to contend for a busy CPU.
+ *
+ * Unbound workqueues provide such wide affinity and allow blocking on
+ * UMH_WAIT_PROC requests without holding up pending requests (up to some
+ * limit).
+ *
+ * Besides, workqueues provide the privilege level that the caller might not
+ * have to perform the usermodehelper request.
+ */
+static void call_usermodehelper_exec_work(struct work_struct *work)
{
struct subprocess_info *sub_info =
container_of(work, struct subprocess_info, work);
- pid_t pid;
- if (sub_info->wait & UMH_WAIT_PROC)
- pid = kernel_thread(wait_for_helper, sub_info,
- CLONE_FS | CLONE_FILES | SIGCHLD);
- else
- pid = kernel_thread(____call_usermodehelper, sub_info,
- SIGCHLD);
+ if (sub_info->wait & UMH_WAIT_PROC) {
+ call_usermodehelper_exec_sync(sub_info);
+ } else {
+ pid_t pid;
- if (pid < 0) {
- sub_info->retval = pid;
- umh_complete(sub_info);
+ pid = kernel_thread(call_usermodehelper_exec_async, sub_info,
+ SIGCHLD);
+ if (pid < 0) {
+ sub_info->retval = pid;
+ umh_complete(sub_info);
+ }
}
}
@@ -509,7 +523,7 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
if (!sub_info)
goto out;
- INIT_WORK(&sub_info->work, __call_usermodehelper);
+ INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
sub_info->path = path;
sub_info->argv = argv;
sub_info->envp = envp;
@@ -531,8 +545,8 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
* from interrupt context.
*
* Runs a user-space application. The application is started
- * asynchronously if wait is not set, and runs as a child of keventd.
- * (ie. it runs with full root capabilities).
+ * asynchronously if wait is not set, and runs as a child of system workqueues.
+ * (ie. it runs with full root capabilities and optimized affinity).
*/
int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
{
@@ -544,7 +558,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
return -EINVAL;
}
helper_lock();
- if (!khelper_wq || usermodehelper_disabled) {
+ if (usermodehelper_disabled) {
retval = -EBUSY;
goto out;
}
@@ -556,7 +570,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
sub_info->wait = wait;
- queue_work(khelper_wq, &sub_info->work);
+ queue_work(system_unbound_wq, &sub_info->work);
if (wait == UMH_NO_WAIT) /* task has freed sub_info */
goto unlock;
@@ -686,9 +700,3 @@ struct ctl_table usermodehelper_table[] = {
},
{ }
};
-
-void __init usermodehelper_init(void)
-{
- khelper_wq = create_singlethread_workqueue("khelper");
- BUG_ON(!khelper_wq);
-}
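For context, callers of the reworked machinery above usually go through the call_usermodehelper() convenience wrapper; a sketch (the helper path and arguments are illustrative):

	char *argv[] = { "/sbin/my-helper", "add", NULL };
	char *envp[] = { "HOME=/",
			 "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
	int ret;

	ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
	if (ret)
		pr_warn("helper failed: %d\n", ret);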
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 6683ccef9fff..e83b26464061 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -90,7 +90,7 @@ static ssize_t profiling_store(struct kobject *kobj,
KERNEL_ATTR_RW(profiling);
#endif
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
static ssize_t kexec_loaded_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
}
KERNEL_ATTR_RO(vmcoreinfo);
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
/* whether file capabilities are enabled */
static ssize_t fscaps_show(struct kobject *kobj,
@@ -196,7 +196,7 @@ static struct attribute * kernel_attrs[] = {
#ifdef CONFIG_PROFILING
&profiling_attr.attr,
#endif
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
&kexec_loaded_attr.attr,
&kexec_crash_loaded_attr.attr,
&kexec_crash_size_attr.attr,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 490924cc9e7c..9ff173dca1ae 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -248,15 +248,16 @@ static void create_kthread(struct kthread_create_info *create)
* kthread_create_on_node - create a kthread.
* @threadfn: the function to run until signal_pending(current).
* @data: data ptr for @threadfn.
- * @node: memory node number.
+ * @node: task and thread structures for the thread are allocated on this node
* @namefmt: printf-style name for the thread.
*
* Description: This helper function creates and names a kernel
* thread. The thread will be stopped: use wake_up_process() to start
- * it. See also kthread_run().
+ * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and
+ * is affine to all CPUs.
*
* If thread is going to be bound on a particular cpu, give its node
- * in @node, to get NUMA affinity for kthread stack, or else give -1.
+ * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
* When woken, the thread will run @threadfn() with @data as its
* argument. @threadfn() can either call do_exit() directly if it is a
* standalone thread for which no one will call kthread_stop(), or
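A usage sketch matching the documentation above (my_thread_fn and my_data are placeholders):

	struct task_struct *tsk;

	tsk = kthread_create_on_node(my_thread_fn, my_data, NUMA_NO_NODE,
				     "my-worker/%d", 0);
	if (!IS_ERR(tsk))
		wake_up_process(tsk);	/* kthread_run() combines both steps */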
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 7dd5c9918e4c..8e96f6cc2a4a 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,5 +1,5 @@
-obj-y += mutex.o semaphore.o rwsem.o
+obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
@@ -20,11 +20,9 @@ obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
-obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
-obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 652a8ee8efe9..f32567254867 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -88,6 +88,19 @@ void percpu_down_read(struct percpu_rw_semaphore *brw)
__up_read(&brw->rw_sem);
}
+int percpu_down_read_trylock(struct percpu_rw_semaphore *brw)
+{
+ if (unlikely(!update_fast_ctr(brw, +1))) {
+ if (!__down_read_trylock(&brw->rw_sem))
+ return 0;
+ atomic_inc(&brw->slow_read_ctr);
+ __up_read(&brw->rw_sem);
+ }
+
+ rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_);
+ return 1;
+}
+
void percpu_up_read(struct percpu_rw_semaphore *brw)
{
rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_);
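A usage sketch for the new trylock (my_rwsem is a placeholder percpu_rw_semaphore):

	if (percpu_down_read_trylock(&my_rwsem)) {
		/* read-side critical section */
		percpu_up_read(&my_rwsem);
	}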
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index 6c5da483966b..f17a3e3b3550 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -55,27 +55,29 @@ rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
{
while ((cnts & _QW_WMASK) == _QW_LOCKED) {
cpu_relax_lowlatency();
- cnts = smp_load_acquire((u32 *)&lock->cnts);
+ cnts = atomic_read_acquire(&lock->cnts);
}
}
/**
- * queue_read_lock_slowpath - acquire read lock of a queue rwlock
+ * queued_read_lock_slowpath - acquire read lock of a queue rwlock
* @lock: Pointer to queue rwlock structure
+ * @cnts: Current qrwlock lock value
*/
-void queue_read_lock_slowpath(struct qrwlock *lock)
+void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
{
- u32 cnts;
-
/*
* Readers come here when they cannot get the lock without waiting
*/
if (unlikely(in_interrupt())) {
/*
- * Readers in interrupt context will spin until the lock is
- * available without waiting in the queue.
+ * Readers in interrupt context will get the lock immediately
+ * if the writer is just waiting (not holding the lock yet).
+ * The rspin_until_writer_unlock() function returns immediately
+ * in this case. Otherwise, they will spin (with ACQUIRE
+ * semantics) until the lock is available without waiting in
+ * the queue.
*/
- cnts = smp_load_acquire((u32 *)&lock->cnts);
rspin_until_writer_unlock(lock, cnts);
return;
}
@@ -87,16 +89,11 @@ void queue_read_lock_slowpath(struct qrwlock *lock)
arch_spin_lock(&lock->lock);
/*
- * At the head of the wait queue now, wait until the writer state
- * goes to 0 and then try to increment the reader count and get
- * the lock. It is possible that an incoming writer may steal the
- * lock in the interim, so it is necessary to check the writer byte
- * to make sure that the write lock isn't taken.
+ * The ACQUIRE semantics of the following spinning code ensure
+ * that accesses can't leak upwards out of our subsequent critical
+ * section in the case that the lock is currently held for write.
*/
- while (atomic_read(&lock->cnts) & _QW_WMASK)
- cpu_relax_lowlatency();
-
- cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS;
+ cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS;
rspin_until_writer_unlock(lock, cnts);
/*
@@ -104,13 +101,13 @@ void queue_read_lock_slowpath(struct qrwlock *lock)
*/
arch_spin_unlock(&lock->lock);
}
-EXPORT_SYMBOL(queue_read_lock_slowpath);
+EXPORT_SYMBOL(queued_read_lock_slowpath);
/**
- * queue_write_lock_slowpath - acquire write lock of a queue rwlock
+ * queued_write_lock_slowpath - acquire write lock of a queue rwlock
* @lock : Pointer to queue rwlock structure
*/
-void queue_write_lock_slowpath(struct qrwlock *lock)
+void queued_write_lock_slowpath(struct qrwlock *lock)
{
u32 cnts;
@@ -119,7 +116,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
/* Try to acquire the lock directly if no reader is present */
if (!atomic_read(&lock->cnts) &&
- (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0))
+ (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0))
goto unlock;
/*
@@ -130,7 +127,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
struct __qrwlock *l = (struct __qrwlock *)lock;
if (!READ_ONCE(l->wmode) &&
- (cmpxchg(&l->wmode, 0, _QW_WAITING) == 0))
+ (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0))
break;
cpu_relax_lowlatency();
@@ -140,8 +137,8 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
for (;;) {
cnts = atomic_read(&lock->cnts);
if ((cnts == _QW_WAITING) &&
- (atomic_cmpxchg(&lock->cnts, _QW_WAITING,
- _QW_LOCKED) == _QW_WAITING))
+ (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING,
+ _QW_LOCKED) == _QW_WAITING))
break;
cpu_relax_lowlatency();
@@ -149,4 +146,4 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
unlock:
arch_spin_unlock(&lock->lock);
}
-EXPORT_SYMBOL(queue_write_lock_slowpath);
+EXPORT_SYMBOL(queued_write_lock_slowpath);
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 38c49202d532..337c8818541d 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -239,8 +239,8 @@ static __always_inline void set_locked(struct qspinlock *lock)
static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { }
-
+static __always_inline void __pv_kick_node(struct qspinlock *lock,
+ struct mcs_spinlock *node) { }
static __always_inline void __pv_wait_head(struct qspinlock *lock,
struct mcs_spinlock *node) { }
@@ -440,7 +440,7 @@ queue:
cpu_relax();
arch_mcs_spin_unlock_contended(&next->locked);
- pv_kick_node(next);
+ pv_kick_node(lock, next);
release:
/*
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index df19ae4debd0..c8e6e9a596f5 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -22,9 +22,14 @@
#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET)
+/*
+ * Queue node uses: vcpu_running & vcpu_halted.
+ * Queue head uses: vcpu_running & vcpu_hashed.
+ */
enum vcpu_state {
vcpu_running = 0,
- vcpu_halted,
+ vcpu_halted, /* Used only in pv_wait_node */
+ vcpu_hashed, /* = pv_hash'ed + vcpu_halted */
};
struct pv_node {
@@ -153,7 +158,8 @@ static void pv_init_node(struct mcs_spinlock *node)
/*
* Wait for node->locked to become true, halt the vcpu after a short spin.
- * pv_kick_node() is used to wake the vcpu again.
+ * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
+ * behalf.
*/
static void pv_wait_node(struct mcs_spinlock *node)
{
@@ -172,9 +178,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
*
* [S] pn->state = vcpu_halted [S] next->locked = 1
* MB MB
- * [L] pn->locked [RmW] pn->state = vcpu_running
+ * [L] pn->locked [RmW] pn->state = vcpu_hashed
*
- * Matches the xchg() from pv_kick_node().
+ * Matches the cmpxchg() from pv_kick_node().
*/
smp_store_mb(pn->state, vcpu_halted);
@@ -182,9 +188,10 @@ static void pv_wait_node(struct mcs_spinlock *node)
pv_wait(&pn->state, vcpu_halted);
/*
- * Reset the vCPU state to avoid unncessary CPU kicking
+ * If pv_kick_node() changed us to vcpu_hashed, retain that value
+ * so that pv_wait_head() knows to not also try to hash this lock.
*/
- WRITE_ONCE(pn->state, vcpu_running);
+ cmpxchg(&pn->state, vcpu_halted, vcpu_running);
/*
* If the locked flag is still not set after wakeup, it is a
@@ -194,6 +201,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
* MCS lock will be released soon.
*/
}
+
/*
* By now our node->locked should be 1 and our caller will not actually
* spin-wait for it. We do however rely on our caller to do a
@@ -202,24 +210,35 @@ static void pv_wait_node(struct mcs_spinlock *node)
}
/*
- * Called after setting next->locked = 1, used to wake those stuck in
- * pv_wait_node().
+ * Called after setting next->locked = 1 when we're the lock owner.
+ *
+ * Instead of waking the waiters stuck in pv_wait_node(), advance their state
+ * such that they're waiting in pv_wait_head(); this avoids a wake/sleep cycle.
*/
-static void pv_kick_node(struct mcs_spinlock *node)
+static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
{
struct pv_node *pn = (struct pv_node *)node;
+ struct __qspinlock *l = (void *)lock;
/*
- * Note that because node->locked is already set, this actual
- * mcs_spinlock entry could be re-used already.
+ * If the vCPU is indeed halted, advance its state to match that of
+ * pv_wait_node(). If OTOH this fails, the vCPU was running and will
+ * observe its next->locked value and advance itself.
*
- * This should be fine however, kicking people for no reason is
- * harmless.
+ * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
+ */
+ if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
+ return;
+
+ /*
+ * Put the lock into the hash table and set the _Q_SLOW_VAL.
*
- * See the comment in pv_wait_node().
+ * As this is the same vCPU that will check the _Q_SLOW_VAL value and
+ * the hash table later on at unlock time, no atomic instruction is
+ * needed.
*/
- if (xchg(&pn->state, vcpu_running) == vcpu_halted)
- pv_kick(pn->cpu);
+ WRITE_ONCE(l->locked, _Q_SLOW_VAL);
+ (void)pv_hash(lock, pn);
}
/*
@@ -233,6 +252,13 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
struct qspinlock **lp = NULL;
int loop;
+ /*
+ * If pv_kick_node() already advanced our state, we don't need to
+ * insert ourselves into the hash table anymore.
+ */
+ if (READ_ONCE(pn->state) == vcpu_hashed)
+ lp = (struct qspinlock **)1;
+
for (;;) {
for (loop = SPIN_THRESHOLD; loop; loop--) {
if (!READ_ONCE(l->locked))
@@ -240,17 +266,22 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
cpu_relax();
}
- WRITE_ONCE(pn->state, vcpu_halted);
if (!lp) { /* ONCE */
+ WRITE_ONCE(pn->state, vcpu_hashed);
lp = pv_hash(lock, pn);
+
/*
- * lp must be set before setting _Q_SLOW_VAL
+ * We must hash before setting _Q_SLOW_VAL, such that
+ * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock()
+ * we'll be sure to be able to observe our hash entry.
*
- * [S] lp = lock [RmW] l = l->locked = 0
- * MB MB
- * [S] l->locked = _Q_SLOW_VAL [L] lp
+ * [S] pn->state
+ * [S] <hash> [RmW] l->locked == _Q_SLOW_VAL
+ * MB RMB
+ * [RmW] l->locked = _Q_SLOW_VAL [L] <unhash>
+ * [L] pn->state
*
- * Matches the cmpxchg() in __pv_queued_spin_unlock().
+ * Matches the smp_rmb() in __pv_queued_spin_unlock().
*/
if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
/*
@@ -287,24 +318,34 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
{
struct __qspinlock *l = (void *)lock;
struct pv_node *node;
- u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+ u8 locked;
/*
* We must not unlock if SLOW, because in that case we must first
* unhash. Otherwise it would be possible to have multiple @lock
* entries, which would be BAD.
*/
- if (likely(lockval == _Q_LOCKED_VAL))
+ locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+ if (likely(locked == _Q_LOCKED_VAL))
return;
- if (unlikely(lockval != _Q_SLOW_VAL)) {
- if (debug_locks_silent)
- return;
- WARN(1, "pvqspinlock: lock %p has corrupted value 0x%x!\n", lock, atomic_read(&lock->val));
+ if (unlikely(locked != _Q_SLOW_VAL)) {
+ WARN(!debug_locks_silent,
+ "pvqspinlock: lock 0x%lx has corrupted value 0x%x!\n",
+ (unsigned long)lock, atomic_read(&lock->val));
return;
}
/*
+ * A failed cmpxchg doesn't provide any memory-ordering guarantees,
+ * so we need a barrier to order the read of the node data in
+ * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
+ *
+ * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL.
+ */
+ smp_rmb();
+
+ /*
* Since the above failed to release, this must be the SLOW path.
* Therefore start by looking up the blocked node and unhashing it.
*/
@@ -319,8 +360,11 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
/*
* At this point the memory pointed at by lock can be freed/reused,
* however we can still use the pv_node to kick the CPU.
+ * The other vCPU may not really be halted, but kicking an active
+ * vCPU is harmless other than the additional latency in completing
+ * the unlock.
*/
- if (READ_ONCE(node->state) == vcpu_halted)
+ if (READ_ONCE(node->state) == vcpu_hashed)
pv_kick(node->cpu);
}
/*
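The rework above turns pn->state into a small cmpxchg-driven state machine (running -> halted -> hashed): the lock holder promotes a genuinely halted waiter to vcpu_hashed and pre-hashes the lock instead of kicking it, and the waiter, on wakeup, only falls back to vcpu_running if nobody promoted it. A stripped-down sketch of that handshake with C11 atomics, all names invented for illustration:

#include <stdatomic.h>

enum toy_state { TOY_RUNNING, TOY_HALTED, TOY_HASHED };

/* Waiter side (cf. pv_wait_node): announce the halt, block, then keep
 * TOY_HASHED if the lock holder advanced us while we slept. */
static void waiter_park(_Atomic int *state)
{
	int expected = TOY_HALTED;

	atomic_store(state, TOY_HALTED);
	/* ... sleep until kicked or until the MCS lock is handed over ... */
	atomic_compare_exchange_strong(state, &expected, TOY_RUNNING);
	/* On failure the state is already TOY_HASHED and must stay that way. */
}

/* Lock-holder side (cf. pv_kick_node): only a waiter that is really halted
 * gets promoted; a running waiter will notice next->locked by itself. */
static int holder_promote(_Atomic int *state)
{
	int expected = TOY_HALTED;

	if (!atomic_compare_exchange_strong(state, &expected, TOY_HASHED))
		return 0;
	/* ... hash the lock and set _Q_SLOW_VAL on the waiter's behalf ... */
	return 1;
}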
diff --git a/kernel/locking/rtmutex-tester.c b/kernel/locking/rtmutex-tester.c
deleted file mode 100644
index 1d96dd0d93c1..000000000000
--- a/kernel/locking/rtmutex-tester.c
+++ /dev/null
@@ -1,420 +0,0 @@
-/*
- * RT-Mutex-tester: scriptable tester for rt mutexes
- *
- * started by Thomas Gleixner:
- *
- * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
- *
- */
-#include <linux/device.h>
-#include <linux/kthread.h>
-#include <linux/export.h>
-#include <linux/sched.h>
-#include <linux/sched/rt.h>
-#include <linux/spinlock.h>
-#include <linux/timer.h>
-#include <linux/freezer.h>
-#include <linux/stat.h>
-
-#include "rtmutex.h"
-
-#define MAX_RT_TEST_THREADS 8
-#define MAX_RT_TEST_MUTEXES 8
-
-static spinlock_t rttest_lock;
-static atomic_t rttest_event;
-
-struct test_thread_data {
- int opcode;
- int opdata;
- int mutexes[MAX_RT_TEST_MUTEXES];
- int event;
- struct device dev;
-};
-
-static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
-static struct task_struct *threads[MAX_RT_TEST_THREADS];
-static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES];
-
-enum test_opcodes {
- RTTEST_NOP = 0,
- RTTEST_SCHEDOT, /* 1 Sched other, data = nice */
- RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */
- RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */
- RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */
- RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */
- RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */
- RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */
- RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */
- /* 9, 10 - reserved for BKL commemoration */
- RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */
- RTTEST_RESETEVENT = 98, /* 98 Reset event counter */
- RTTEST_RESET = 99, /* 99 Reset all pending operations */
-};
-
-static int handle_op(struct test_thread_data *td, int lockwakeup)
-{
- int i, id, ret = -EINVAL;
-
- switch(td->opcode) {
-
- case RTTEST_NOP:
- return 0;
-
- case RTTEST_LOCKCONT:
- td->mutexes[td->opdata] = 1;
- td->event = atomic_add_return(1, &rttest_event);
- return 0;
-
- case RTTEST_RESET:
- for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) {
- if (td->mutexes[i] == 4) {
- rt_mutex_unlock(&mutexes[i]);
- td->mutexes[i] = 0;
- }
- }
- return 0;
-
- case RTTEST_RESETEVENT:
- atomic_set(&rttest_event, 0);
- return 0;
-
- default:
- if (lockwakeup)
- return ret;
- }
-
- switch(td->opcode) {
-
- case RTTEST_LOCK:
- case RTTEST_LOCKNOWAIT:
- id = td->opdata;
- if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
- return ret;
-
- td->mutexes[id] = 1;
- td->event = atomic_add_return(1, &rttest_event);
- rt_mutex_lock(&mutexes[id]);
- td->event = atomic_add_return(1, &rttest_event);
- td->mutexes[id] = 4;
- return 0;
-
- case RTTEST_LOCKINT:
- case RTTEST_LOCKINTNOWAIT:
- id = td->opdata;
- if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
- return ret;
-
- td->mutexes[id] = 1;
- td->event = atomic_add_return(1, &rttest_event);
- ret = rt_mutex_lock_interruptible(&mutexes[id], 0);
- td->event = atomic_add_return(1, &rttest_event);
- td->mutexes[id] = ret ? 0 : 4;
- return ret ? -EINTR : 0;
-
- case RTTEST_UNLOCK:
- id = td->opdata;
- if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4)
- return ret;
-
- td->event = atomic_add_return(1, &rttest_event);
- rt_mutex_unlock(&mutexes[id]);
- td->event = atomic_add_return(1, &rttest_event);
- td->mutexes[id] = 0;
- return 0;
-
- default:
- break;
- }
- return ret;
-}
-
-/*
- * Schedule replacement for rtsem_down(). Only called for threads with
- * PF_MUTEX_TESTER set.
- *
- * This allows us to have finegrained control over the event flow.
- *
- */
-void schedule_rt_mutex_test(struct rt_mutex *mutex)
-{
- int tid, op, dat;
- struct test_thread_data *td;
-
- /* We have to lookup the task */
- for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) {
- if (threads[tid] == current)
- break;
- }
-
- BUG_ON(tid == MAX_RT_TEST_THREADS);
-
- td = &thread_data[tid];
-
- op = td->opcode;
- dat = td->opdata;
-
- switch (op) {
- case RTTEST_LOCK:
- case RTTEST_LOCKINT:
- case RTTEST_LOCKNOWAIT:
- case RTTEST_LOCKINTNOWAIT:
- if (mutex != &mutexes[dat])
- break;
-
- if (td->mutexes[dat] != 1)
- break;
-
- td->mutexes[dat] = 2;
- td->event = atomic_add_return(1, &rttest_event);
- break;
-
- default:
- break;
- }
-
- schedule();
-
-
- switch (op) {
- case RTTEST_LOCK:
- case RTTEST_LOCKINT:
- if (mutex != &mutexes[dat])
- return;
-
- if (td->mutexes[dat] != 2)
- return;
-
- td->mutexes[dat] = 3;
- td->event = atomic_add_return(1, &rttest_event);
- break;
-
- case RTTEST_LOCKNOWAIT:
- case RTTEST_LOCKINTNOWAIT:
- if (mutex != &mutexes[dat])
- return;
-
- if (td->mutexes[dat] != 2)
- return;
-
- td->mutexes[dat] = 1;
- td->event = atomic_add_return(1, &rttest_event);
- return;
-
- default:
- return;
- }
-
- td->opcode = 0;
-
- for (;;) {
- set_current_state(TASK_INTERRUPTIBLE);
-
- if (td->opcode > 0) {
- int ret;
-
- set_current_state(TASK_RUNNING);
- ret = handle_op(td, 1);
- set_current_state(TASK_INTERRUPTIBLE);
- if (td->opcode == RTTEST_LOCKCONT)
- break;
- td->opcode = ret;
- }
-
- /* Wait for the next command to be executed */
- schedule();
- }
-
- /* Restore previous command and data */
- td->opcode = op;
- td->opdata = dat;
-}
-
-static int test_func(void *data)
-{
- struct test_thread_data *td = data;
- int ret;
-
- current->flags |= PF_MUTEX_TESTER;
- set_freezable();
- allow_signal(SIGHUP);
-
- for(;;) {
-
- set_current_state(TASK_INTERRUPTIBLE);
-
- if (td->opcode > 0) {
- set_current_state(TASK_RUNNING);
- ret = handle_op(td, 0);
- set_current_state(TASK_INTERRUPTIBLE);
- td->opcode = ret;
- }
-
- /* Wait for the next command to be executed */
- schedule();
- try_to_freeze();
-
- if (signal_pending(current))
- flush_signals(current);
-
- if(kthread_should_stop())
- break;
- }
- return 0;
-}
-
-/**
- * sysfs_test_command - interface for test commands
- * @dev: thread reference
- * @buf: command for actual step
- * @count: length of buffer
- *
- * command syntax:
- *
- * opcode:data
- */
-static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct sched_param schedpar;
- struct test_thread_data *td;
- char cmdbuf[32];
- int op, dat, tid, ret;
-
- td = container_of(dev, struct test_thread_data, dev);
- tid = td->dev.id;
-
- /* strings from sysfs write are not 0 terminated! */
- if (count >= sizeof(cmdbuf))
- return -EINVAL;
-
- /* strip of \n: */
- if (buf[count-1] == '\n')
- count--;
- if (count < 1)
- return -EINVAL;
-
- memcpy(cmdbuf, buf, count);
- cmdbuf[count] = 0;
-
- if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2)
- return -EINVAL;
-
- switch (op) {
- case RTTEST_SCHEDOT:
- schedpar.sched_priority = 0;
- ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar);
- if (ret)
- return ret;
- set_user_nice(current, 0);
- break;
-
- case RTTEST_SCHEDRT:
- schedpar.sched_priority = dat;
- ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar);
- if (ret)
- return ret;
- break;
-
- case RTTEST_SIGNAL:
- send_sig(SIGHUP, threads[tid], 0);
- break;
-
- default:
- if (td->opcode > 0)
- return -EBUSY;
- td->opdata = dat;
- td->opcode = op;
- wake_up_process(threads[tid]);
- }
-
- return count;
-}
-
-/**
- * sysfs_test_status - sysfs interface for rt tester
- * @dev: thread to query
- * @buf: char buffer to be filled with thread status info
- */
-static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- struct test_thread_data *td;
- struct task_struct *tsk;
- char *curr = buf;
- int i;
-
- td = container_of(dev, struct test_thread_data, dev);
- tsk = threads[td->dev.id];
-
- spin_lock(&rttest_lock);
-
- curr += sprintf(curr,
- "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:",
- td->opcode, td->event, tsk->state,
- (MAX_RT_PRIO - 1) - tsk->prio,
- (MAX_RT_PRIO - 1) - tsk->normal_prio,
- tsk->pi_blocked_on);
-
- for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
- curr += sprintf(curr, "%d", td->mutexes[i]);
-
- spin_unlock(&rttest_lock);
-
- curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
- mutexes[td->dev.id].owner);
-
- return curr - buf;
-}
-
-static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL);
-static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command);
-
-static struct bus_type rttest_subsys = {
- .name = "rttest",
- .dev_name = "rttest",
-};
-
-static int init_test_thread(int id)
-{
- thread_data[id].dev.bus = &rttest_subsys;
- thread_data[id].dev.id = id;
-
- threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
- if (IS_ERR(threads[id]))
- return PTR_ERR(threads[id]);
-
- return device_register(&thread_data[id].dev);
-}
-
-static int init_rttest(void)
-{
- int ret, i;
-
- spin_lock_init(&rttest_lock);
-
- for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
- rt_mutex_init(&mutexes[i]);
-
- ret = subsys_system_register(&rttest_subsys, NULL);
- if (ret)
- return ret;
-
- for (i = 0; i < MAX_RT_TEST_THREADS; i++) {
- ret = init_test_thread(i);
- if (ret)
- break;
- ret = device_create_file(&thread_data[i].dev, &dev_attr_status);
- if (ret)
- break;
- ret = device_create_file(&thread_data[i].dev, &dev_attr_command);
- if (ret)
- break;
- }
-
- printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" );
-
- return ret;
-}
-
-device_initcall(init_rttest);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 5674b073473c..7781d801212f 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1120,7 +1120,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
debug_rt_mutex_print_deadlock(waiter);
- schedule_rt_mutex(lock);
+ schedule();
raw_spin_lock(&lock->wait_lock);
set_current_state(state);
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 7844f8f0e639..4f5f83c7d2d3 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -15,28 +15,6 @@
#include <linux/rtmutex.h>
/*
- * The rtmutex in kernel tester is independent of rtmutex debugging. We
- * call schedule_rt_mutex_test() instead of schedule() for the tasks which
- * belong to the tester. That way we can delay the wakeup path of those
- * threads to provoke lock stealing and testing of complex boosting scenarios.
- */
-#ifdef CONFIG_RT_MUTEX_TESTER
-
-extern void schedule_rt_mutex_test(struct rt_mutex *lock);
-
-#define schedule_rt_mutex(_lock) \
- do { \
- if (!(current->flags & PF_MUTEX_TESTER)) \
- schedule(); \
- else \
- schedule_rt_mutex_test(_lock); \
- } while (0)
-
-#else
-# define schedule_rt_mutex(_lock) schedule()
-#endif
-
-/*
* This is the control structure for tasks blocked on a rt_mutex,
* which is allocated on the kernel stack on of the blocked task.
*
diff --git a/kernel/memremap.c b/kernel/memremap.c
new file mode 100644
index 000000000000..72b0c66628b6
--- /dev/null
+++ b/kernel/memremap.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright(c) 2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/device.h>
+#include <linux/types.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/memory_hotplug.h>
+
+#ifndef ioremap_cache
+/* temporary while we convert existing ioremap_cache users to memremap */
+__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
+{
+ return ioremap(offset, size);
+}
+#endif
+
+/**
+ * memremap() - remap an iomem_resource as cacheable memory
+ * @offset: iomem resource start address
+ * @size: size of remap
+ * @flags: either MEMREMAP_WB or MEMREMAP_WT
+ *
+ * memremap() is "ioremap" for cases where it is known that the resource
+ * being mapped does not have i/o side effects and the __iomem
+ * annotation is not applicable.
+ *
+ * MEMREMAP_WB - matches the default mapping for "System RAM" on
+ * the architecture. This is usually a read-allocate write-back cache.
+ * Moreover, if MEMREMAP_WB is specified and the requested remap region is RAM
+ * memremap() will bypass establishing a new mapping and instead return
+ * a pointer into the direct map.
+ *
+ * MEMREMAP_WT - establish a mapping whereby writes either bypass the
+ * cache or are written through to memory and never exist in a
+ * cache-dirty state with respect to program visibility. Attempts to
+ * map "System RAM" with this mapping type will fail.
+ */
+void *memremap(resource_size_t offset, size_t size, unsigned long flags)
+{
+ int is_ram = region_intersects(offset, size, "System RAM");
+ void *addr = NULL;
+
+ if (is_ram == REGION_MIXED) {
+ WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n",
+ &offset, (unsigned long) size);
+ return NULL;
+ }
+
+ /* Try all mapping types requested until one returns non-NULL */
+ if (flags & MEMREMAP_WB) {
+ flags &= ~MEMREMAP_WB;
+ /*
+ * MEMREMAP_WB is special in that it can be satisfied
+ * from the direct map. Some archs depend on the
+ * capability of memremap() to autodetect cases where
+ * the requested range is potentially in "System RAM"
+ */
+ if (is_ram == REGION_INTERSECTS)
+ addr = __va(offset);
+ else
+ addr = ioremap_cache(offset, size);
+ }
+
+ /*
+ * If we don't have a mapping yet and more request flags are
+ * pending then we will be attempting to establish a new virtual
+ * address mapping. Enforce that this mapping is not aliasing
+ * "System RAM"
+ */
+ if (!addr && is_ram == REGION_INTERSECTS && flags) {
+ WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n",
+ &offset, (unsigned long) size);
+ return NULL;
+ }
+
+ if (!addr && (flags & MEMREMAP_WT)) {
+ flags &= ~MEMREMAP_WT;
+ addr = ioremap_wt(offset, size);
+ }
+
+ return addr;
+}
+EXPORT_SYMBOL(memremap);
+
+void memunmap(void *addr)
+{
+ if (is_vmalloc_addr(addr))
+ iounmap((void __iomem *) addr);
+}
+EXPORT_SYMBOL(memunmap);
+
+static void devm_memremap_release(struct device *dev, void *res)
+{
+ memunmap(res);
+}
+
+static int devm_memremap_match(struct device *dev, void *res, void *match_data)
+{
+ return *(void **)res == match_data;
+}
+
+void *devm_memremap(struct device *dev, resource_size_t offset,
+ size_t size, unsigned long flags)
+{
+ void **ptr, *addr;
+
+ ptr = devres_alloc(devm_memremap_release, sizeof(*ptr), GFP_KERNEL);
+ if (!ptr)
+ return NULL;
+
+ addr = memremap(offset, size, flags);
+ if (addr) {
+ *ptr = addr;
+ devres_add(dev, ptr);
+ } else
+ devres_free(ptr);
+
+ return addr;
+}
+EXPORT_SYMBOL(devm_memremap);
+
+void devm_memunmap(struct device *dev, void *addr)
+{
+ WARN_ON(devres_destroy(dev, devm_memremap_release, devm_memremap_match,
+ addr));
+ memunmap(addr);
+}
+EXPORT_SYMBOL(devm_memunmap);
+
+#ifdef CONFIG_ZONE_DEVICE
+struct page_map {
+ struct resource res;
+};
+
+static void devm_memremap_pages_release(struct device *dev, void *res)
+{
+ struct page_map *page_map = res;
+
+ /* pages are dead and unused, undo the arch mapping */
+ arch_remove_memory(page_map->res.start, resource_size(&page_map->res));
+}
+
+void *devm_memremap_pages(struct device *dev, struct resource *res)
+{
+ int is_ram = region_intersects(res->start, resource_size(res),
+ "System RAM");
+ struct page_map *page_map;
+ int error, nid;
+
+ if (is_ram == REGION_MIXED) {
+ WARN_ONCE(1, "%s attempted on mixed region %pr\n",
+ __func__, res);
+ return ERR_PTR(-ENXIO);
+ }
+
+ if (is_ram == REGION_INTERSECTS)
+ return __va(res->start);
+
+ page_map = devres_alloc(devm_memremap_pages_release,
+ sizeof(*page_map), GFP_KERNEL);
+ if (!page_map)
+ return ERR_PTR(-ENOMEM);
+
+ memcpy(&page_map->res, res, sizeof(*res));
+
+ nid = dev_to_node(dev);
+ if (nid < 0)
+ nid = 0;
+
+ error = arch_add_memory(nid, res->start, resource_size(res), true);
+ if (error) {
+ devres_free(page_map);
+ return ERR_PTR(error);
+ }
+
+ devres_add(dev, page_map);
+ return __va(res->start);
+}
+EXPORT_SYMBOL(devm_memremap_pages);
+#endif /* CONFIG_ZONE_DEVICE */
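For a driver, the point of the new devm_memremap() is a cacheable, non-__iomem mapping of a resource known to have no I/O side effects. A rough sketch of how a hypothetical platform driver might use it (everything except the memremap API itself is invented; note that in this version the devm_ variant returns NULL on failure):

#include <linux/io.h>
#include <linux/platform_device.h>
#include <linux/string.h>

static int foo_probe(struct platform_device *pdev)
{
	struct resource *res;
	void *base;

	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	if (!res)
		return -ENODEV;

	/* Write-back mapping; satisfied from the direct map if this is RAM. */
	base = devm_memremap(&pdev->dev, res->start, resource_size(res),
			     MEMREMAP_WB);
	if (!base)
		return -ENOMEM;

	/* base is a plain pointer, so ordinary memset()/memcpy() are fine. */
	memset(base, 0, resource_size(res));
	return 0;
}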
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index be5b8fac4bd0..bd62f5cda746 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -10,11 +10,8 @@
*/
#include <linux/kernel.h>
-#include <linux/err.h>
-#include <crypto/public_key.h>
-#include <crypto/hash.h>
-#include <keys/asymmetric-type.h>
#include <keys/system_keyring.h>
+#include <crypto/public_key.h>
#include "module-internal.h"
/*
@@ -28,170 +25,22 @@
* - Information block
*/
struct module_signature {
- u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */
- u8 hash; /* Digest algorithm [enum hash_algo] */
- u8 id_type; /* Key identifier type [enum pkey_id_type] */
- u8 signer_len; /* Length of signer's name */
- u8 key_id_len; /* Length of key identifier */
+ u8 algo; /* Public-key crypto algorithm [0] */
+ u8 hash; /* Digest algorithm [0] */
+ u8 id_type; /* Key identifier type [PKEY_ID_PKCS7] */
+ u8 signer_len; /* Length of signer's name [0] */
+ u8 key_id_len; /* Length of key identifier [0] */
u8 __pad[3];
__be32 sig_len; /* Length of signature data */
};
/*
- * Digest the module contents.
- */
-static struct public_key_signature *mod_make_digest(enum hash_algo hash,
- const void *mod,
- unsigned long modlen)
-{
- struct public_key_signature *pks;
- struct crypto_shash *tfm;
- struct shash_desc *desc;
- size_t digest_size, desc_size;
- int ret;
-
- pr_devel("==>%s()\n", __func__);
-
- /* Allocate the hashing algorithm we're going to need and find out how
- * big the hash operational data will be.
- */
- tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0);
- if (IS_ERR(tfm))
- return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm);
-
- desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
- digest_size = crypto_shash_digestsize(tfm);
-
- /* We allocate the hash operational data storage on the end of our
- * context data and the digest output buffer on the end of that.
- */
- ret = -ENOMEM;
- pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL);
- if (!pks)
- goto error_no_pks;
-
- pks->pkey_hash_algo = hash;
- pks->digest = (u8 *)pks + sizeof(*pks) + desc_size;
- pks->digest_size = digest_size;
-
- desc = (void *)pks + sizeof(*pks);
- desc->tfm = tfm;
- desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
-
- ret = crypto_shash_init(desc);
- if (ret < 0)
- goto error;
-
- ret = crypto_shash_finup(desc, mod, modlen, pks->digest);
- if (ret < 0)
- goto error;
-
- crypto_free_shash(tfm);
- pr_devel("<==%s() = ok\n", __func__);
- return pks;
-
-error:
- kfree(pks);
-error_no_pks:
- crypto_free_shash(tfm);
- pr_devel("<==%s() = %d\n", __func__, ret);
- return ERR_PTR(ret);
-}
-
-/*
- * Extract an MPI array from the signature data. This represents the actual
- * signature. Each raw MPI is prefaced by a BE 2-byte value indicating the
- * size of the MPI in bytes.
- *
- * RSA signatures only have one MPI, so currently we only read one.
- */
-static int mod_extract_mpi_array(struct public_key_signature *pks,
- const void *data, size_t len)
-{
- size_t nbytes;
- MPI mpi;
-
- if (len < 3)
- return -EBADMSG;
- nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1];
- data += 2;
- len -= 2;
- if (len != nbytes)
- return -EBADMSG;
-
- mpi = mpi_read_raw_data(data, nbytes);
- if (!mpi)
- return -ENOMEM;
- pks->mpi[0] = mpi;
- pks->nr_mpi = 1;
- return 0;
-}
-
-/*
- * Request an asymmetric key.
- */
-static struct key *request_asymmetric_key(const char *signer, size_t signer_len,
- const u8 *key_id, size_t key_id_len)
-{
- key_ref_t key;
- size_t i;
- char *id, *q;
-
- pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len);
-
- /* Construct an identifier. */
- id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL);
- if (!id)
- return ERR_PTR(-ENOKEY);
-
- memcpy(id, signer, signer_len);
-
- q = id + signer_len;
- *q++ = ':';
- *q++ = ' ';
- for (i = 0; i < key_id_len; i++) {
- *q++ = hex_asc[*key_id >> 4];
- *q++ = hex_asc[*key_id++ & 0x0f];
- }
-
- *q = 0;
-
- pr_debug("Look up: \"%s\"\n", id);
-
- key = keyring_search(make_key_ref(system_trusted_keyring, 1),
- &key_type_asymmetric, id);
- if (IS_ERR(key))
- pr_warn("Request for unknown module key '%s' err %ld\n",
- id, PTR_ERR(key));
- kfree(id);
-
- if (IS_ERR(key)) {
- switch (PTR_ERR(key)) {
- /* Hide some search errors */
- case -EACCES:
- case -ENOTDIR:
- case -EAGAIN:
- return ERR_PTR(-ENOKEY);
- default:
- return ERR_CAST(key);
- }
- }
-
- pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key)));
- return key_ref_to_ptr(key);
-}
-
-/*
* Verify the signature on a module.
*/
int mod_verify_sig(const void *mod, unsigned long *_modlen)
{
- struct public_key_signature *pks;
struct module_signature ms;
- struct key *key;
- const void *sig;
size_t modlen = *_modlen, sig_len;
- int ret;
pr_devel("==>%s(,%zu)\n", __func__, modlen);
@@ -205,46 +54,24 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen)
if (sig_len >= modlen)
return -EBADMSG;
modlen -= sig_len;
- if ((size_t)ms.signer_len + ms.key_id_len >= modlen)
- return -EBADMSG;
- modlen -= (size_t)ms.signer_len + ms.key_id_len;
-
*_modlen = modlen;
- sig = mod + modlen;
-
- /* For the moment, only support RSA and X.509 identifiers */
- if (ms.algo != PKEY_ALGO_RSA ||
- ms.id_type != PKEY_ID_X509)
- return -ENOPKG;
- if (ms.hash >= PKEY_HASH__LAST ||
- !hash_algo_name[ms.hash])
+ if (ms.id_type != PKEY_ID_PKCS7) {
+ pr_err("Module is not signed with expected PKCS#7 message\n");
return -ENOPKG;
-
- key = request_asymmetric_key(sig, ms.signer_len,
- sig + ms.signer_len, ms.key_id_len);
- if (IS_ERR(key))
- return PTR_ERR(key);
-
- pks = mod_make_digest(ms.hash, mod, modlen);
- if (IS_ERR(pks)) {
- ret = PTR_ERR(pks);
- goto error_put_key;
}
- ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len,
- sig_len);
- if (ret < 0)
- goto error_free_pks;
-
- ret = verify_signature(key, pks);
- pr_devel("verify_signature() = %d\n", ret);
+ if (ms.algo != 0 ||
+ ms.hash != 0 ||
+ ms.signer_len != 0 ||
+ ms.key_id_len != 0 ||
+ ms.__pad[0] != 0 ||
+ ms.__pad[1] != 0 ||
+ ms.__pad[2] != 0) {
+ pr_err("PKCS#7 signature info has unexpected non-zero params\n");
+ return -EBADMSG;
+ }
-error_free_pks:
- mpi_free(pks->rsa.s);
- kfree(pks);
-error_put_key:
- key_put(key);
- pr_devel("<==%s() = %d\n", __func__, ret);
- return ret;
+ return system_verify_data(mod, modlen, mod + modlen, sig_len,
+ VERIFYING_MODULE_SIGNATURE);
}
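The layout mod_verify_sig() now expects at the end of a module image is: the module proper, the raw PKCS#7 blob of sig_len bytes, then the fixed 12-byte struct module_signature (the "~Module signature appended~\n" magic has already been stripped by the caller). A hedged user-space sketch of walking that trailer backwards, mirroring the bounds checks above; the zero checks on the legacy fields and all error reporting are elided:

#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>		/* ntohl() for the big-endian sig_len */

struct modsig_trailer {		/* same layout as struct module_signature */
	uint8_t  algo, hash, id_type, signer_len, key_id_len, pad[3];
	uint32_t sig_len;	/* big-endian length of the PKCS#7 blob */
};

/* Return a pointer to the PKCS#7 blob, shrinking *modlen to the signed data. */
static const unsigned char *find_pkcs7(const unsigned char *mod, size_t *modlen,
				       size_t *sig_len)
{
	struct modsig_trailer ms;

	if (*modlen < sizeof(ms))
		return NULL;
	memcpy(&ms, mod + *modlen - sizeof(ms), sizeof(ms));
	*modlen -= sizeof(ms);

	if (ms.id_type != 2 /* PKEY_ID_PKCS7, assumption */)
		return NULL;
	*sig_len = ntohl(ms.sig_len);
	if (*sig_len >= *modlen)
		return NULL;
	*modlen -= *sig_len;
	return mod + *modlen;	/* start of the PKCS#7 blob */
}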
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 2f30ca91e4fa..b2066fb5b10f 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -227,27 +227,23 @@ static void hib_init_batch(struct hib_bio_batch *hb)
hb->error = 0;
}
-static void hib_end_io(struct bio *bio, int error)
+static void hib_end_io(struct bio *bio)
{
struct hib_bio_batch *hb = bio->bi_private;
- const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct page *page = bio->bi_io_vec[0].bv_page;
- if (!uptodate || error) {
+ if (bio->bi_error) {
printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
imajor(bio->bi_bdev->bd_inode),
iminor(bio->bi_bdev->bd_inode),
(unsigned long long)bio->bi_iter.bi_sector);
-
- if (!error)
- error = -EIO;
}
if (bio_data_dir(bio) == WRITE)
put_page(page);
- if (error && !hb->error)
- hb->error = error;
+ if (bio->bi_error && !hb->error)
+ hb->error = bio->bi_error;
if (atomic_dec_and_test(&hb->count))
wake_up(&hb->wait);
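The hib_end_io() change tracks the block layer's move from the BIO_UPTODATE flag plus an error argument to a single bio->bi_error field: completion callbacks now take only the bio and read the error from it. A minimal sketch of an endio handler under the new convention (the request structure and names are invented):

#include <linux/bio.h>
#include <linux/completion.h>

struct foo_request {
	struct completion done;
	int status;
};

static void foo_end_io(struct bio *bio)
{
	struct foo_request *req = bio->bi_private;

	/* bi_error is 0 on success, a negative errno on failure. */
	if (bio->bi_error)
		req->status = bio->bi_error;

	complete(&req->done);
	bio_put(bio);
}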
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index cf8c24203368..8f0324ef72ab 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -835,7 +835,7 @@ const struct file_operations kmsg_fops = {
.release = devkmsg_release,
};
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
/*
* This appends the listed symbols to /proc/vmcore
*
diff --git a/kernel/profile.c b/kernel/profile.c
index a7bcd28d6e9f..99513e1160e5 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -339,7 +339,7 @@ static int profile_cpu_callback(struct notifier_block *info,
node = cpu_to_mem(cpu);
per_cpu(cpu_profile_flip, cpu) = 0;
if (!per_cpu(cpu_profile_hits, cpu)[1]) {
- page = alloc_pages_exact_node(node,
+ page = __alloc_pages_node(node,
GFP_KERNEL | __GFP_ZERO,
0);
if (!page)
@@ -347,7 +347,7 @@ static int profile_cpu_callback(struct notifier_block *info,
per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
}
if (!per_cpu(cpu_profile_hits, cpu)[0]) {
- page = alloc_pages_exact_node(node,
+ page = __alloc_pages_node(node,
GFP_KERNEL | __GFP_ZERO,
0);
if (!page)
@@ -543,14 +543,14 @@ static int create_hash_tables(void)
int node = cpu_to_mem(cpu);
struct page *page;
- page = alloc_pages_exact_node(node,
+ page = __alloc_pages_node(node,
GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
0);
if (!page)
goto out_cleanup;
per_cpu(cpu_profile_hits, cpu)[1]
= (struct profile_hit *)page_address(page);
- page = alloc_pages_exact_node(node,
+ page = __alloc_pages_node(node,
GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
0);
if (!page)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c8e0e050a36a..787320de68e0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -556,6 +556,19 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data)
if (data & ~(unsigned long)PTRACE_O_MASK)
return -EINVAL;
+ if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
+ if (!config_enabled(CONFIG_CHECKPOINT_RESTORE) ||
+ !config_enabled(CONFIG_SECCOMP))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
+ current->ptrace & PT_SUSPEND_SECCOMP)
+ return -EPERM;
+ }
+
/* Avoid intermediate state when all opts are cleared */
flags = child->ptrace;
flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT);
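From user space the new option is set like any other ptrace option, but only by a tracer that holds CAP_SYS_ADMIN, is not itself confined by seccomp, and runs on a kernel with CONFIG_CHECKPOINT_RESTORE and CONFIG_SECCOMP. A hedged sketch of a checkpoint/restore-style tracer enabling it on an already-stopped tracee (the fallback #define is an assumption; check your installed headers):

#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>

#ifndef PTRACE_O_SUSPEND_SECCOMP
#define PTRACE_O_SUSPEND_SECCOMP (1 << 21)	/* assumed value: 0x00200000 */
#endif

static int suspend_seccomp(pid_t pid)
{
	/* The tracee must already be ptrace-stopped (e.g. after PTRACE_SEIZE
	 * plus PTRACE_INTERRUPT). */
	if (ptrace(PTRACE_SETOPTIONS, pid, NULL,
		   (void *)(long)PTRACE_O_SUSPEND_SECCOMP) == -1) {
		perror("PTRACE_SETOPTIONS");
		return -1;
	}
	return 0;
}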
diff --git a/kernel/reboot.c b/kernel/reboot.c
index d20c85d9f8c0..bd30a973fe94 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -346,7 +346,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
kernel_restart(buffer);
break;
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
case LINUX_REBOOT_CMD_KEXEC:
ret = kernel_kexec();
break;
diff --git a/kernel/resource.c b/kernel/resource.c
index fed052a1bc9f..f150dbbe6f62 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -492,40 +492,51 @@ int __weak page_is_ram(unsigned long pfn)
}
EXPORT_SYMBOL_GPL(page_is_ram);
-/*
- * Search for a resouce entry that fully contains the specified region.
- * If found, return 1 if it is RAM, 0 if not.
- * If not found, or region is not fully contained, return -1
+/**
+ * region_intersects() - determine intersection of region with known resources
+ * @start: region start address
+ * @size: size of region
+ * @name: name of resource (in iomem_resource)
*
- * Used by the ioremap functions to ensure the user is not remapping RAM and is
- * a vast speed up over walking through the resource table page by page.
+ * Check if the specified region partially overlaps or fully eclipses a
+ * resource identified by @name. Return REGION_DISJOINT if the region
+ * does not overlap @name, return REGION_MIXED if the region overlaps
+ * @name and another resource, and return REGION_INTERSECTS if the
+ * region overlaps @name and no other defined resource. Note that
+ * REGION_INTERSECTS is also returned in the case when the specified
+ * region overlaps RAM and undefined memory holes.
+ *
+ * region_intersects() is used by memory remapping functions to ensure
+ * the user is not remapping RAM and is a vast speed up over walking
+ * through the resource table page by page.
*/
-int region_is_ram(resource_size_t start, unsigned long size)
+int region_intersects(resource_size_t start, size_t size, const char *name)
{
- struct resource *p;
- resource_size_t end = start + size - 1;
unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- const char *name = "System RAM";
- int ret = -1;
+ resource_size_t end = start + size - 1;
+ int type = 0; int other = 0;
+ struct resource *p;
read_lock(&resource_lock);
for (p = iomem_resource.child; p ; p = p->sibling) {
- if (p->end < start)
- continue;
-
- if (p->start <= start && end <= p->end) {
- /* resource fully contains region */
- if ((p->flags != flags) || strcmp(p->name, name))
- ret = 0;
- else
- ret = 1;
- break;
- }
- if (end < p->start)
- break; /* not found */
+ bool is_type = strcmp(p->name, name) == 0 && p->flags == flags;
+
+ if (start >= p->start && start <= p->end)
+ is_type ? type++ : other++;
+ if (end >= p->start && end <= p->end)
+ is_type ? type++ : other++;
+ if (p->start >= start && p->end <= end)
+ is_type ? type++ : other++;
}
read_unlock(&resource_lock);
- return ret;
+
+ if (other == 0)
+ return type ? REGION_INTERSECTS : REGION_DISJOINT;
+
+ if (type)
+ return REGION_MIXED;
+
+ return REGION_DISJOINT;
}
void __weak arch_remove_reservations(struct resource *avail)
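Callers are meant to branch on all three return values rather than treat the result as a boolean. A short, hedged sketch of how memremap()-style code consumes the interface (helper name invented; header placement per this series):

#include <linux/ioport.h>
#include <linux/mm.h>

/* Decide whether a candidate range may be remapped as ordinary memory. */
static int foo_check_range(resource_size_t start, size_t size)
{
	switch (region_intersects(start, size, "System RAM")) {
	case REGION_DISJOINT:
		return 0;		/* pure MMIO or hole: safe to ioremap */
	case REGION_INTERSECTS:
		return 1;		/* entirely RAM: use the direct map */
	case REGION_MIXED:
	default:
		return -EINVAL;		/* straddles RAM and non-RAM: reject */
	}
}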
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8b864ecee0e1..3595403921bd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -164,14 +164,12 @@ struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
static void sched_feat_disable(int i)
{
- if (static_key_enabled(&sched_feat_keys[i]))
- static_key_slow_dec(&sched_feat_keys[i]);
+ static_key_disable(&sched_feat_keys[i]);
}
static void sched_feat_enable(int i)
{
- if (!static_key_enabled(&sched_feat_keys[i]))
- static_key_slow_inc(&sched_feat_keys[i]);
+ static_key_enable(&sched_feat_keys[i]);
}
#else
static void sched_feat_disable(int i) { };
@@ -8133,7 +8131,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
sched_offline_group(tg);
}
-static void cpu_cgroup_fork(struct task_struct *task)
+static void cpu_cgroup_fork(struct task_struct *task, void *private)
{
sched_move_task(task);
}
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 052e02672d12..272d9322bc5d 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -106,9 +106,10 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
}
EXPORT_SYMBOL_GPL(__wake_up_locked);
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr,
+ void *key)
{
- __wake_up_common(q, mode, 1, 0, key);
+ __wake_up_common(q, mode, nr, 0, key);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_key);
@@ -283,7 +284,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
if (!list_empty(&wait->task_list))
list_del_init(&wait->task_list);
else if (waitqueue_active(q))
- __wake_up_locked_key(q, mode, key);
+ __wake_up_locked_key(q, mode, 1, key);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(abort_exclusive_wait);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 245df6b32b81..5bd4779282df 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -175,17 +175,16 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
*/
static u32 seccomp_run_filters(struct seccomp_data *sd)
{
- struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);
struct seccomp_data sd_local;
u32 ret = SECCOMP_RET_ALLOW;
+ /* Make sure cross-thread synced filter points somewhere sane. */
+ struct seccomp_filter *f =
+ lockless_dereference(current->seccomp.filter);
/* Ensure unexpected behavior doesn't result in failing open. */
if (unlikely(WARN_ON(f == NULL)))
return SECCOMP_RET_KILL;
- /* Make sure cross-thread synced filter points somewhere sane. */
- smp_read_barrier_depends();
-
if (!sd) {
populate_seccomp_data(&sd_local);
sd = &sd_local;
@@ -549,7 +548,11 @@ void secure_computing_strict(int this_syscall)
{
int mode = current->seccomp.mode;
- if (mode == 0)
+ if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
+ unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
+ return;
+
+ if (mode == SECCOMP_MODE_DISABLED)
return;
else if (mode == SECCOMP_MODE_STRICT)
__secure_computing_strict(this_syscall);
@@ -650,6 +653,10 @@ u32 seccomp_phase1(struct seccomp_data *sd)
int this_syscall = sd ? sd->nr :
syscall_get_nr(current, task_pt_regs(current));
+ if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
+ unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
+ return SECCOMP_PHASE1_OK;
+
switch (mode) {
case SECCOMP_MODE_STRICT:
__secure_computing_strict(this_syscall); /* may call do_exit */
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 7c434c39f02a..a818cbc73e14 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -113,7 +113,8 @@ static int smpboot_thread_fn(void *data)
if (kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
preempt_enable();
- if (ht->cleanup)
+ /* cleanup must mirror setup */
+ if (ht->cleanup && td->status != HP_THREAD_NONE)
ht->cleanup(td->cpu, cpu_online(td->cpu));
kfree(td);
return 0;
@@ -259,15 +260,6 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
{
unsigned int cpu;
- /* Unpark any threads that were voluntarily parked. */
- for_each_cpu_not(cpu, ht->cpumask) {
- if (cpu_online(cpu)) {
- struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
- if (tsk)
- kthread_unpark(tsk);
- }
- }
-
/* We need to destroy also the parked threads of offline cpus */
for_each_possible_cpu(cpu) {
struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
@@ -281,19 +273,22 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
}
/**
- * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug
+ * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related
+ * to hotplug
* @plug_thread: Hotplug thread descriptor
+ * @cpumask: The cpumask where threads run
*
* Creates and starts the threads on all online cpus.
*/
-int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
+int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
+ const struct cpumask *cpumask)
{
unsigned int cpu;
int ret = 0;
if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
return -ENOMEM;
- cpumask_copy(plug_thread->cpumask, cpu_possible_mask);
+ cpumask_copy(plug_thread->cpumask, cpumask);
get_online_cpus();
mutex_lock(&smpboot_threads_lock);
@@ -301,9 +296,11 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
ret = __smpboot_create_thread(plug_thread, cpu);
if (ret) {
smpboot_destroy_threads(plug_thread);
+ free_cpumask_var(plug_thread->cpumask);
goto out;
}
- smpboot_unpark_thread(plug_thread, cpu);
+ if (cpumask_test_cpu(cpu, cpumask))
+ smpboot_unpark_thread(plug_thread, cpu);
}
list_add(&plug_thread->list, &hotplug_threads);
out:
@@ -311,7 +308,7 @@ out:
put_online_cpus();
return ret;
}
-EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
+EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask);
/**
* smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
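The cpumask-aware variant is what lets a client such as the watchdog create its per-cpu threads everywhere while only unparking them on a chosen subset. A hedged sketch of a caller, with every identifier other than the smpboot API invented:

#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/smpboot.h>

static DEFINE_PER_CPU(struct task_struct *, foo_task);

static int foo_should_run(unsigned int cpu)
{
	return 0;			/* illustration only: never any work */
}

static void foo_thread_fn(unsigned int cpu)
{
}

static struct smp_hotplug_thread foo_threads = {
	.store			= &foo_task,
	.thread_should_run	= foo_should_run,
	.thread_fn		= foo_thread_fn,
	.thread_comm		= "foo/%u",
};

/* Threads are created for all possible CPUs but only unparked on @mask. */
static int foo_start(const struct cpumask *mask)
{
	return smpboot_register_percpu_thread_cpumask(&foo_threads, mask);
}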
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index ca7d84f438f1..03c3875d9958 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -219,6 +219,7 @@ cond_syscall(compat_sys_timerfd_gettime);
cond_syscall(sys_eventfd);
cond_syscall(sys_eventfd2);
cond_syscall(sys_memfd_create);
+cond_syscall(sys_userfaultfd);
/* performance counters: */
cond_syscall(sys_perf_event_open);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 19b62b522158..e69201d8094e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -621,7 +621,7 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
{
.procname = "kexec_load_disabled",
.data = &kexec_load_disabled,
@@ -1995,7 +1995,7 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
int val = *valp;
if (val < 0) {
*negp = true;
- *lvalp = (unsigned long)-val;
+ *lvalp = -(unsigned long)val;
} else {
*negp = false;
*lvalp = (unsigned long)val;
@@ -2201,7 +2201,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
int val = *valp;
if (val < 0) {
*negp = true;
- *lvalp = (unsigned long)-val;
+ *lvalp = -(unsigned long)val;
} else {
*negp = false;
*lvalp = (unsigned long)val;
@@ -2436,7 +2436,7 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
unsigned long lval;
if (val < 0) {
*negp = true;
- lval = (unsigned long)-val;
+ lval = -(unsigned long)val;
} else {
*negp = false;
lval = (unsigned long)val;
@@ -2459,7 +2459,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp
unsigned long lval;
if (val < 0) {
*negp = true;
- lval = (unsigned long)-val;
+ lval = -(unsigned long)val;
} else {
*negp = false;
lval = (unsigned long)val;
@@ -2484,7 +2484,7 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
unsigned long lval;
if (val < 0) {
*negp = true;
- lval = (unsigned long)-val;
+ lval = -(unsigned long)val;
} else {
*negp = false;
lval = (unsigned long)val;
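All of the (unsigned long)-val to -(unsigned long)val conversions above fix the same latent bug: negating INT_MIN as a signed int overflows and is undefined behaviour, while converting to unsigned first and negating modulo 2^N is well defined and yields the intended magnitude. A tiny user-space illustration:

#include <limits.h>
#include <stdio.h>

int main(void)
{
	int val = INT_MIN;

	/* Old form: "-val" overflows int before the cast -- undefined. */
	/* unsigned long bad = (unsigned long)-val; */

	/* New form: convert first, negate in unsigned arithmetic. */
	unsigned long good = -(unsigned long)val;

	printf("%lu\n", good);		/* 2147483648 on an LP64 machine */
	return 0;
}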
diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S
deleted file mode 100644
index 3e9868d47535..000000000000
--- a/kernel/system_certificates.S
+++ /dev/null
@@ -1,20 +0,0 @@
-#include <linux/export.h>
-#include <linux/init.h>
-
- __INITRODATA
-
- .align 8
- .globl VMLINUX_SYMBOL(system_certificate_list)
-VMLINUX_SYMBOL(system_certificate_list):
-__cert_list_start:
- .incbin "kernel/x509_certificate_list"
-__cert_list_end:
-
- .align 8
- .globl VMLINUX_SYMBOL(system_certificate_list_size)
-VMLINUX_SYMBOL(system_certificate_list_size):
-#ifdef CONFIG_64BIT
- .quad __cert_list_end - __cert_list_start
-#else
- .long __cert_list_end - __cert_list_start
-#endif
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c
deleted file mode 100644
index 875f64e8935b..000000000000
--- a/kernel/system_keyring.c
+++ /dev/null
@@ -1,106 +0,0 @@
-/* System trusted keyring for trusted public keys
- *
- * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#include <linux/export.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/cred.h>
-#include <linux/err.h>
-#include <keys/asymmetric-type.h>
-#include <keys/system_keyring.h>
-#include "module-internal.h"
-
-struct key *system_trusted_keyring;
-EXPORT_SYMBOL_GPL(system_trusted_keyring);
-
-extern __initconst const u8 system_certificate_list[];
-extern __initconst const unsigned long system_certificate_list_size;
-
-/*
- * Load the compiled-in keys
- */
-static __init int system_trusted_keyring_init(void)
-{
- pr_notice("Initialise system trusted keyring\n");
-
- system_trusted_keyring =
- keyring_alloc(".system_keyring",
- KUIDT_INIT(0), KGIDT_INIT(0), current_cred(),
- ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
- KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH),
- KEY_ALLOC_NOT_IN_QUOTA, NULL);
- if (IS_ERR(system_trusted_keyring))
- panic("Can't allocate system trusted keyring\n");
-
- set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags);
- return 0;
-}
-
-/*
- * Must be initialised before we try and load the keys into the keyring.
- */
-device_initcall(system_trusted_keyring_init);
-
-/*
- * Load the compiled-in list of X.509 certificates.
- */
-static __init int load_system_certificate_list(void)
-{
- key_ref_t key;
- const u8 *p, *end;
- size_t plen;
-
- pr_notice("Loading compiled-in X.509 certificates\n");
-
- p = system_certificate_list;
- end = p + system_certificate_list_size;
- while (p < end) {
- /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
- * than 256 bytes in size.
- */
- if (end - p < 4)
- goto dodgy_cert;
- if (p[0] != 0x30 &&
- p[1] != 0x82)
- goto dodgy_cert;
- plen = (p[2] << 8) | p[3];
- plen += 4;
- if (plen > end - p)
- goto dodgy_cert;
-
- key = key_create_or_update(make_key_ref(system_trusted_keyring, 1),
- "asymmetric",
- NULL,
- p,
- plen,
- ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
- KEY_USR_VIEW | KEY_USR_READ),
- KEY_ALLOC_NOT_IN_QUOTA |
- KEY_ALLOC_TRUSTED);
- if (IS_ERR(key)) {
- pr_err("Problem loading in-kernel X.509 certificate (%ld)\n",
- PTR_ERR(key));
- } else {
- set_bit(KEY_FLAG_BUILTIN, &key_ref_to_ptr(key)->flags);
- pr_notice("Loaded X.509 cert '%s'\n",
- key_ref_to_ptr(key)->description);
- key_ref_put(key);
- }
- p += plen;
- }
-
- return 0;
-
-dodgy_cert:
- pr_err("Problem parsing in-kernel X.509 certificate list\n");
- return 0;
-}
-late_initcall(load_system_certificate_list);
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 8727032e3a6f..53fa971d000d 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -18,6 +18,8 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */
* This is like the signal handler which runs in kernel mode, but it doesn't
* try to wake up the @task.
*
+ * Note: there is no ordering guarantee on works queued here.
+ *
* RETURNS:
* 0 if succeeds or -ESRCH.
*/
@@ -108,16 +110,6 @@ void task_work_run(void)
raw_spin_unlock_wait(&task->pi_lock);
smp_mb();
- /* Reverse the list to run the works in fifo order */
- head = NULL;
- do {
- next = work->next;
- work->next = head;
- head = work;
- work = next;
- } while (work);
-
- work = head;
do {
next = work->next;
work->func(work);
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b3e6b39b6cf9..90e72a0c3047 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -778,9 +778,6 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
if (likely(!bt))
return;
- if (!error && !bio_flagged(bio, BIO_UPTODATE))
- error = EIO;
-
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
bio->bi_rw, what, error, 0, NULL);
}
@@ -887,8 +884,7 @@ static void blk_add_trace_split(void *ignore,
__blk_add_trace(bt, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT,
- !bio_flagged(bio, BIO_UPTODATE),
- sizeof(rpdu), &rpdu);
+ bio->bi_error, sizeof(rpdu), &rpdu);
}
}
@@ -920,8 +916,8 @@ static void blk_add_trace_bio_remap(void *ignore,
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio->bi_rw, BLK_TA_REMAP,
- !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+ bio->bi_rw, BLK_TA_REMAP, bio->bi_error,
+ sizeof(r), &r);
}
/**
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 88a041adee90..0fe96c7c8803 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -81,13 +81,16 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
/*
* limited trace_printk()
- * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed
+ * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
*/
static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
{
char *fmt = (char *) (long) r1;
+ bool str_seen = false;
int mod[3] = {};
int fmt_cnt = 0;
+ u64 unsafe_addr;
+ char buf[64];
int i;
/*
@@ -114,12 +117,37 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
if (fmt[i] == 'l') {
mod[fmt_cnt]++;
i++;
- } else if (fmt[i] == 'p') {
+ } else if (fmt[i] == 'p' || fmt[i] == 's') {
mod[fmt_cnt]++;
i++;
if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0)
return -EINVAL;
fmt_cnt++;
+ if (fmt[i - 1] == 's') {
+ if (str_seen)
+ /* allow only one '%s' per fmt string */
+ return -EINVAL;
+ str_seen = true;
+
+ switch (fmt_cnt) {
+ case 1:
+ unsafe_addr = r3;
+ r3 = (long) buf;
+ break;
+ case 2:
+ unsafe_addr = r4;
+ r4 = (long) buf;
+ break;
+ case 3:
+ unsafe_addr = r5;
+ r5 = (long) buf;
+ break;
+ }
+ buf[0] = 0;
+ strncpy_from_unsafe(buf,
+ (void *) (long) unsafe_addr,
+ sizeof(buf));
+ }
continue;
}
@@ -158,6 +186,35 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
return &bpf_trace_printk_proto;
}
+static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
+{
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct perf_event *event;
+
+ if (unlikely(index >= array->map.max_entries))
+ return -E2BIG;
+
+ event = (struct perf_event *)array->ptrs[index];
+ if (!event)
+ return -ENOENT;
+
+ /*
+ * The return value alone does not tell us whether the read succeeded;
+ * that has to be judged elsewhere, e.g. by the eBPF program itself.
+ */
+ return perf_event_read_local(event);
+}
+
+const struct bpf_func_proto bpf_perf_event_read_proto = {
+ .func = bpf_perf_event_read,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+};
+
static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
@@ -183,6 +240,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
return bpf_get_trace_printk_proto();
case BPF_FUNC_get_smp_processor_id:
return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_perf_event_read:
+ return &bpf_perf_event_read_proto;
default:
return NULL;
}
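The '%s' support has two halves: the format string is still validated byte by byte (only the listed specifiers, at most three conversions, and at most one '%s'), and the argument slot matching the '%s' is redirected at a 64-byte stack buffer filled with strncpy_from_unsafe(). A loose user-space re-statement of just the counting rules, useful when writing programs against this helper (function name invented; the per-specifier whitelist is elided):

#include <stdbool.h>
#include <stddef.h>

/* Accept at most three conversions and at most one "%s", as the helper does. */
static bool fmt_counts_ok(const char *fmt, size_t len)
{
	bool str_seen = false;
	int fmt_cnt = 0;

	for (size_t i = 0; i < len && fmt[i]; i++) {
		if (fmt[i] != '%')
			continue;
		if (fmt_cnt >= 3)
			return false;	/* only three arguments are passed */
		i++;
		while (i < len && fmt[i] == 'l')
			i++;		/* skip length modifiers */
		if (i >= len)
			return false;
		if (fmt[i] == 's') {
			if (str_seen)
				return false;	/* one %s per format string */
			str_seen = true;
		}
		fmt_cnt++;
	}
	return true;
}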
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index eb11011b5292..b0623ac785a2 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -630,13 +630,18 @@ static int function_stat_show(struct seq_file *m, void *v)
goto out;
}
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ avg = rec->time;
+ do_div(avg, rec->counter);
+ if (tracing_thresh && (avg < tracing_thresh))
+ goto out;
+#endif
+
kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
seq_printf(m, " %-30.30s %10lu", str, rec->counter);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
seq_puts(m, " ");
- avg = rec->time;
- do_div(avg, rec->counter);
/* Sample standard deviation (s^2) */
if (rec->counter <= 1)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6260717c18e3..fc347f8b1bca 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -400,6 +400,17 @@ struct rb_irq_work {
};
/*
+ * Structure to hold event state and handle nested events.
+ */
+struct rb_event_info {
+ u64 ts;
+ u64 delta;
+ unsigned long length;
+ struct buffer_page *tail_page;
+ int add_timestamp;
+};
+
+/*
* Used for which event context the event is in.
* NMI = 0
* IRQ = 1
@@ -1876,73 +1887,6 @@ rb_event_index(struct ring_buffer_event *event)
return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
}
-static inline int
-rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event)
-{
- unsigned long addr = (unsigned long)event;
- unsigned long index;
-
- index = rb_event_index(event);
- addr &= PAGE_MASK;
-
- return cpu_buffer->commit_page->page == (void *)addr &&
- rb_commit_index(cpu_buffer) == index;
-}
-
-static void
-rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
-{
- unsigned long max_count;
-
- /*
- * We only race with interrupts and NMIs on this CPU.
- * If we own the commit event, then we can commit
- * all others that interrupted us, since the interruptions
- * are in stack format (they finish before they come
- * back to us). This allows us to do a simple loop to
- * assign the commit to the tail.
- */
- again:
- max_count = cpu_buffer->nr_pages * 100;
-
- while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
- if (RB_WARN_ON(cpu_buffer, !(--max_count)))
- return;
- if (RB_WARN_ON(cpu_buffer,
- rb_is_reader_page(cpu_buffer->tail_page)))
- return;
- local_set(&cpu_buffer->commit_page->page->commit,
- rb_page_write(cpu_buffer->commit_page));
- rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
- cpu_buffer->write_stamp =
- cpu_buffer->commit_page->page->time_stamp;
- /* add barrier to keep gcc from optimizing too much */
- barrier();
- }
- while (rb_commit_index(cpu_buffer) !=
- rb_page_write(cpu_buffer->commit_page)) {
-
- local_set(&cpu_buffer->commit_page->page->commit,
- rb_page_write(cpu_buffer->commit_page));
- RB_WARN_ON(cpu_buffer,
- local_read(&cpu_buffer->commit_page->page->commit) &
- ~RB_WRITE_MASK);
- barrier();
- }
-
- /* again, keep gcc from optimizing */
- barrier();
-
- /*
- * If an interrupt came in just after the first while loop
- * and pushed the tail page forward, we will be left with
- * a dangling commit that will never go forward.
- */
- if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page))
- goto again;
-}
-
static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
{
cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp;
@@ -1968,64 +1912,6 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
iter->head = 0;
}
-/* Slow path, do not inline */
-static noinline struct ring_buffer_event *
-rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
-{
- event->type_len = RINGBUF_TYPE_TIME_EXTEND;
-
- /* Not the first event on the page? */
- if (rb_event_index(event)) {
- event->time_delta = delta & TS_MASK;
- event->array[0] = delta >> TS_SHIFT;
- } else {
- /* nope, just zero it */
- event->time_delta = 0;
- event->array[0] = 0;
- }
-
- return skip_time_extend(event);
-}
-
-/**
- * rb_update_event - update event type and data
- * @event: the event to update
- * @type: the type of event
- * @length: the size of the event field in the ring buffer
- *
- * Update the type and data fields of the event. The length
- * is the actual size that is written to the ring buffer,
- * and with this, we can determine what to place into the
- * data field.
- */
-static void
-rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event, unsigned length,
- int add_timestamp, u64 delta)
-{
- /* Only a commit updates the timestamp */
- if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
- delta = 0;
-
- /*
- * If we need to add a timestamp, then we
- * add it to the start of the resevered space.
- */
- if (unlikely(add_timestamp)) {
- event = rb_add_time_stamp(event, delta);
- length -= RB_LEN_TIME_EXTEND;
- delta = 0;
- }
-
- event->time_delta = delta;
- length -= RB_EVNT_HDR_SIZE;
- if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
- event->type_len = 0;
- event->array[0] = length;
- } else
- event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
-}
-
/*
* rb_handle_head_page - writer hit the head page
*
@@ -2184,29 +2070,13 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
return 0;
}
-static unsigned rb_calculate_event_length(unsigned length)
-{
- struct ring_buffer_event event; /* Used only for sizeof array */
-
- /* zero length can cause confusions */
- if (!length)
- length++;
-
- if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
- length += sizeof(event.array[0]);
-
- length += RB_EVNT_HDR_SIZE;
- length = ALIGN(length, RB_ARCH_ALIGNMENT);
-
- return length;
-}
-
static inline void
rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
- struct buffer_page *tail_page,
- unsigned long tail, unsigned long length)
+ unsigned long tail, struct rb_event_info *info)
{
+ struct buffer_page *tail_page = info->tail_page;
struct ring_buffer_event *event;
+ unsigned long length = info->length;
/*
* Only the event that crossed the page boundary
@@ -2276,13 +2146,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
*/
static noinline struct ring_buffer_event *
rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
- unsigned long length, unsigned long tail,
- struct buffer_page *tail_page, u64 ts)
+ unsigned long tail, struct rb_event_info *info)
{
+ struct buffer_page *tail_page = info->tail_page;
struct buffer_page *commit_page = cpu_buffer->commit_page;
struct ring_buffer *buffer = cpu_buffer->buffer;
struct buffer_page *next_page;
int ret;
+ u64 ts;
next_page = tail_page;
@@ -2368,74 +2239,120 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
out_again:
- rb_reset_tail(cpu_buffer, tail_page, tail, length);
+ rb_reset_tail(cpu_buffer, tail, info);
/* fail and let the caller try again */
return ERR_PTR(-EAGAIN);
out_reset:
/* reset write */
- rb_reset_tail(cpu_buffer, tail_page, tail, length);
+ rb_reset_tail(cpu_buffer, tail, info);
return NULL;
}
-static struct ring_buffer_event *
-__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
- unsigned long length, u64 ts,
- u64 delta, int add_timestamp)
+/* Slow path, do not inline */
+static noinline struct ring_buffer_event *
+rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
{
- struct buffer_page *tail_page;
- struct ring_buffer_event *event;
- unsigned long tail, write;
+ event->type_len = RINGBUF_TYPE_TIME_EXTEND;
- /*
- * If the time delta since the last event is too big to
- * hold in the time field of the event, then we append a
- * TIME EXTEND event ahead of the data event.
- */
- if (unlikely(add_timestamp))
- length += RB_LEN_TIME_EXTEND;
+ /* Not the first event on the page? */
+ if (rb_event_index(event)) {
+ event->time_delta = delta & TS_MASK;
+ event->array[0] = delta >> TS_SHIFT;
+ } else {
+ /* nope, just zero it */
+ event->time_delta = 0;
+ event->array[0] = 0;
+ }
- tail_page = cpu_buffer->tail_page;
- write = local_add_return(length, &tail_page->write);
+ return skip_time_extend(event);
+}
- /* set write to only the index of the write */
- write &= RB_WRITE_MASK;
- tail = write - length;
+static inline int rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event);
+
+/**
+ * rb_update_event - update event type and data
+ * @cpu_buffer: the per-cpu buffer the event is reserved in
+ * @event: the event to update
+ * @info: the reservation info carrying the length, delta and timestamp flag
+ *
+ * Update the type and data fields of the event. The length
+ * is the actual size that is written to the ring buffer,
+ * and with this, we can determine what to place into the
+ * data field.
+ */
+static void
+rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event,
+ struct rb_event_info *info)
+{
+ unsigned length = info->length;
+ u64 delta = info->delta;
+
+ /* Only a commit updates the timestamp */
+ if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
+ delta = 0;
/*
- * If this is the first commit on the page, then it has the same
- * timestamp as the page itself.
+ * If we need to add a timestamp, then we
+	 * add it to the start of the reserved space.
*/
- if (!tail)
+ if (unlikely(info->add_timestamp)) {
+ event = rb_add_time_stamp(event, delta);
+ length -= RB_LEN_TIME_EXTEND;
delta = 0;
+ }
- /* See if we shot pass the end of this buffer page */
- if (unlikely(write > BUF_PAGE_SIZE))
- return rb_move_tail(cpu_buffer, length, tail,
- tail_page, ts);
+ event->time_delta = delta;
+ length -= RB_EVNT_HDR_SIZE;
+ if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
+ event->type_len = 0;
+ event->array[0] = length;
+ } else
+ event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
+}
- /* We reserved something on the buffer */
+static unsigned rb_calculate_event_length(unsigned length)
+{
+ struct ring_buffer_event event; /* Used only for sizeof array */
- event = __rb_page_index(tail_page, tail);
- kmemcheck_annotate_bitfield(event, bitfield);
- rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
+	/* zero length can cause confusion */
+ if (!length)
+ length++;
- local_inc(&tail_page->entries);
+ if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
+ length += sizeof(event.array[0]);
+
+ length += RB_EVNT_HDR_SIZE;
+ length = ALIGN(length, RB_ARCH_ALIGNMENT);
/*
- * If this is the first commit on the page, then update
- * its timestamp.
+ * In case the time delta is larger than the 27 bits for it
+ * in the header, we need to add a timestamp. If another
+ * event comes in when trying to discard this one to increase
+ * the length, then the timestamp will be added in the allocated
+ * space of this event. If length is bigger than the size needed
+ * for the TIME_EXTEND, then padding has to be used. The events
+ * length must be either RB_LEN_TIME_EXTEND, or greater than or equal
+ * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding.
+ * As length is a multiple of 4, we only need to worry if it
+ * is 12 (RB_LEN_TIME_EXTEND + 4).
*/
- if (!tail)
- tail_page->page->time_stamp = ts;
+ if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT)
+ length += RB_ALIGNMENT;
- /* account for these added bytes */
- local_add(length, &cpu_buffer->entries_bytes);
+ return length;
+}
- return event;
+#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+static inline bool sched_clock_stable(void)
+{
+ return true;
}
+#endif
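The comment in rb_calculate_event_length() above explains why a reservation that works out to exactly 12 bytes has to be bumped to 16. Below is a runnable userspace sketch of the same arithmetic; the constants are assumed typical values (4-byte event header, RB_ALIGNMENT of 4, RB_LEN_TIME_EXTEND of 8) rather than definitions taken from this patch, and the RB_FORCE_8BYTE_ALIGNMENT case is left out.

#include <stdio.h>

#define RB_ALIGNMENT		4
#define RB_EVNT_HDR_SIZE	4	/* assumed offsetof(event, array) */
#define RB_LEN_TIME_EXTEND	8
#define RB_MAX_SMALL_DATA	(RB_ALIGNMENT * 28)
#define ALIGN(x, a)		(((x) + (a) - 1) & ~((unsigned)(a) - 1))

static unsigned calc_len(unsigned length)
{
	if (!length)				/* zero length can cause confusion */
		length++;
	if (length > RB_MAX_SMALL_DATA)
		length += sizeof(unsigned);	/* room for array[0] */
	length += RB_EVNT_HDR_SIZE;
	length = ALIGN(length, RB_ALIGNMENT);
	/* 12 bytes cannot hold a TIME_EXTEND plus the minimum padding */
	if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT)
		length += RB_ALIGNMENT;
	return length;
}

int main(void)
{
	unsigned l;

	for (l = 0; l <= 16; l += 4)
		printf("payload %2u -> reserved %2u\n", l, calc_len(l));
	return 0;
}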
static inline int
rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
@@ -2483,6 +2400,59 @@ static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
local_inc(&cpu_buffer->commits);
}
+static void
+rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ unsigned long max_count;
+
+ /*
+ * We only race with interrupts and NMIs on this CPU.
+ * If we own the commit event, then we can commit
+ * all others that interrupted us, since the interruptions
+ * are in stack format (they finish before they come
+ * back to us). This allows us to do a simple loop to
+ * assign the commit to the tail.
+ */
+ again:
+ max_count = cpu_buffer->nr_pages * 100;
+
+ while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
+ if (RB_WARN_ON(cpu_buffer, !(--max_count)))
+ return;
+ if (RB_WARN_ON(cpu_buffer,
+ rb_is_reader_page(cpu_buffer->tail_page)))
+ return;
+ local_set(&cpu_buffer->commit_page->page->commit,
+ rb_page_write(cpu_buffer->commit_page));
+ rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
+ cpu_buffer->write_stamp =
+ cpu_buffer->commit_page->page->time_stamp;
+ /* add barrier to keep gcc from optimizing too much */
+ barrier();
+ }
+ while (rb_commit_index(cpu_buffer) !=
+ rb_page_write(cpu_buffer->commit_page)) {
+
+ local_set(&cpu_buffer->commit_page->page->commit,
+ rb_page_write(cpu_buffer->commit_page));
+ RB_WARN_ON(cpu_buffer,
+ local_read(&cpu_buffer->commit_page->page->commit) &
+ ~RB_WRITE_MASK);
+ barrier();
+ }
+
+ /* again, keep gcc from optimizing */
+ barrier();
+
+ /*
+ * If an interrupt came in just after the first while loop
+ * and pushed the tail page forward, we will be left with
+ * a dangling commit that will never go forward.
+ */
+ if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page))
+ goto again;
+}
+
static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
{
unsigned long commits;
@@ -2515,91 +2485,94 @@ static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
}
}
-static struct ring_buffer_event *
-rb_reserve_next_event(struct ring_buffer *buffer,
- struct ring_buffer_per_cpu *cpu_buffer,
- unsigned long length)
+static inline void rb_event_discard(struct ring_buffer_event *event)
{
- struct ring_buffer_event *event;
- u64 ts, delta;
- int nr_loops = 0;
- int add_timestamp;
- u64 diff;
+ if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+ event = skip_time_extend(event);
- rb_start_commit(cpu_buffer);
+ /* array[0] holds the actual length for the discarded event */
+ event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
+ event->type_len = RINGBUF_TYPE_PADDING;
+ /* time delta must be non zero */
+ if (!event->time_delta)
+ event->time_delta = 1;
+}
-#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
- /*
- * Due to the ability to swap a cpu buffer from a buffer
- * it is possible it was swapped before we committed.
- * (committing stops a swap). We check for it here and
- * if it happened, we have to fail the write.
- */
- barrier();
- if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
- local_dec(&cpu_buffer->committing);
- local_dec(&cpu_buffer->commits);
- return NULL;
- }
-#endif
+static inline int
+rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ unsigned long addr = (unsigned long)event;
+ unsigned long index;
- length = rb_calculate_event_length(length);
- again:
- add_timestamp = 0;
- delta = 0;
+ index = rb_event_index(event);
+ addr &= PAGE_MASK;
+
+ return cpu_buffer->commit_page->page == (void *)addr &&
+ rb_commit_index(cpu_buffer) == index;
+}
+
+static void
+rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ u64 delta;
/*
- * We allow for interrupts to reenter here and do a trace.
- * If one does, it will cause this original code to loop
- * back here. Even with heavy interrupts happening, this
- * should only happen a few times in a row. If this happens
- * 1000 times in a row, there must be either an interrupt
- * storm or we have something buggy.
- * Bail!
+ * The event first in the commit queue updates the
+ * time stamp.
*/
- if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
- goto out_fail;
+ if (rb_event_is_commit(cpu_buffer, event)) {
+ /*
+ * A commit event that is first on a page
+ * updates the write timestamp with the page stamp
+ */
+ if (!rb_event_index(event))
+ cpu_buffer->write_stamp =
+ cpu_buffer->commit_page->page->time_stamp;
+ else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
+ delta = event->array[0];
+ delta <<= TS_SHIFT;
+ delta += event->time_delta;
+ cpu_buffer->write_stamp += delta;
+ } else
+ cpu_buffer->write_stamp += event->time_delta;
+ }
+}
- ts = rb_time_stamp(cpu_buffer->buffer);
- diff = ts - cpu_buffer->write_stamp;
+static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ local_inc(&cpu_buffer->entries);
+ rb_update_write_stamp(cpu_buffer, event);
+ rb_end_commit(cpu_buffer);
+}
- /* make sure this diff is calculated here */
- barrier();
+static __always_inline void
+rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
+{
+ bool pagebusy;
- /* Did the write stamp get updated already? */
- if (likely(ts >= cpu_buffer->write_stamp)) {
- delta = diff;
- if (unlikely(test_time_stamp(delta))) {
- int local_clock_stable = 1;
-#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
- local_clock_stable = sched_clock_stable();
-#endif
- WARN_ONCE(delta > (1ULL << 59),
- KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
- (unsigned long long)delta,
- (unsigned long long)ts,
- (unsigned long long)cpu_buffer->write_stamp,
- local_clock_stable ? "" :
- "If you just came from a suspend/resume,\n"
- "please switch to the trace global clock:\n"
- " echo global > /sys/kernel/debug/tracing/trace_clock\n");
- add_timestamp = 1;
- }
+ if (buffer->irq_work.waiters_pending) {
+ buffer->irq_work.waiters_pending = false;
+		/* irq_work_queue() supplies its own memory barriers */
+ irq_work_queue(&buffer->irq_work.work);
}
- event = __rb_reserve_next(cpu_buffer, length, ts,
- delta, add_timestamp);
- if (unlikely(PTR_ERR(event) == -EAGAIN))
- goto again;
-
- if (!event)
- goto out_fail;
+ if (cpu_buffer->irq_work.waiters_pending) {
+ cpu_buffer->irq_work.waiters_pending = false;
+		/* irq_work_queue() supplies its own memory barriers */
+ irq_work_queue(&cpu_buffer->irq_work.work);
+ }
- return event;
+ pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
- out_fail:
- rb_end_commit(cpu_buffer);
- return NULL;
+ if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) {
+ cpu_buffer->irq_work.wakeup_full = true;
+ cpu_buffer->irq_work.full_waiters_pending = false;
+		/* irq_work_queue() supplies its own memory barriers */
+ irq_work_queue(&cpu_buffer->irq_work.work);
+ }
}
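rb_wakeups() only sets flags and queues irq_work because a commit can complete in hard-irq or NMI context, where waking a waiter directly is not safe. A hedged kernel-style sketch of that deferral pattern follows; the struct and function names here are illustrative, not the ones this file uses.

#include <linux/irq_work.h>
#include <linux/wait.h>
#include <linux/kernel.h>

struct wakeup_work {
	struct irq_work		work;
	wait_queue_head_t	waiters;
	bool			waiters_pending;
};

/* Runs later, from a context where waking tasks is safe. */
static void wakeup_work_func(struct irq_work *work)
{
	struct wakeup_work *w = container_of(work, struct wakeup_work, work);

	wake_up_all(&w->waiters);
}

static void wakeup_work_init(struct wakeup_work *w)
{
	init_waitqueue_head(&w->waiters);
	init_irq_work(&w->work, wakeup_work_func);
}

/* Called from the write path, possibly in hard-irq or NMI context. */
static void wakeup_work_request(struct wakeup_work *w)
{
	if (w->waiters_pending) {
		w->waiters_pending = false;
		irq_work_queue(&w->work);	/* supplies its own barriers */
	}
}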
/*
@@ -2672,6 +2645,178 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
}
/**
+ * ring_buffer_unlock_commit - commit a reserved event
+ * @buffer: The buffer to commit to
+ * @event: The event pointer to commit.
+ *
+ * This commits the data to the ring buffer, and releases any locks held.
+ *
+ * Must be paired with ring_buffer_lock_reserve.
+ */
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu = raw_smp_processor_id();
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ rb_commit(cpu_buffer, event);
+
+ rb_wakeups(buffer, cpu_buffer);
+
+ trace_recursive_unlock(cpu_buffer);
+
+ preempt_enable_notrace();
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
+
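ring_buffer_unlock_commit() is only meaningful as the second half of a reserve/commit pair. A hedged sketch of a typical caller follows; the payload layout and function name are invented for illustration.

#include <linux/ring_buffer.h>
#include <linux/errno.h>

/* Illustrative payload; real users define their own entry layout. */
struct sample_entry {
	unsigned long	ip;
	unsigned long	parent_ip;
};

static int write_sample(struct ring_buffer *buffer,
			unsigned long ip, unsigned long parent_ip)
{
	struct ring_buffer_event *event;
	struct sample_entry *entry;

	event = ring_buffer_lock_reserve(buffer, sizeof(*entry));
	if (!event)
		return -EBUSY;	/* buffer full, or recursion was blocked */

	entry = ring_buffer_event_data(event);
	entry->ip = ip;
	entry->parent_ip = parent_ip;

	return ring_buffer_unlock_commit(buffer, event);
}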
+static noinline void
+rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct rb_event_info *info)
+{
+ WARN_ONCE(info->delta > (1ULL << 59),
+ KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
+ (unsigned long long)info->delta,
+ (unsigned long long)info->ts,
+ (unsigned long long)cpu_buffer->write_stamp,
+ sched_clock_stable() ? "" :
+ "If you just came from a suspend/resume,\n"
+ "please switch to the trace global clock:\n"
+ " echo global > /sys/kernel/debug/tracing/trace_clock\n");
+ info->add_timestamp = 1;
+}
+
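rb_handle_timestamp() only flags that the delta no longer fits in the event's time_delta field; rb_add_time_stamp() above then splits it across a TIME_EXTEND event, and rb_update_write_stamp() reassembles it on commit. A runnable userspace sketch of that split and rejoin, assuming the usual TS_SHIFT of 27 bits:

#include <stdio.h>
#include <stdint.h>

#define TS_SHIFT	27			/* assumed width of event->time_delta */
#define TS_MASK		((1ULL << TS_SHIFT) - 1)

int main(void)
{
	uint64_t delta = 5000000000ULL;		/* ~5 s in ns: too big for 27 bits */
	uint32_t time_delta, array0;
	uint64_t rebuilt;

	/* split as rb_add_time_stamp() does */
	time_delta = delta & TS_MASK;
	array0 = delta >> TS_SHIFT;

	/* rejoin as rb_update_write_stamp() does for TIME_EXTEND events */
	rebuilt = ((uint64_t)array0 << TS_SHIFT) + time_delta;

	printf("delta=%llu rebuilt=%llu\n",
	       (unsigned long long)delta, (unsigned long long)rebuilt);
	return 0;
}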
+static struct ring_buffer_event *
+__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
+ struct rb_event_info *info)
+{
+ struct ring_buffer_event *event;
+ struct buffer_page *tail_page;
+ unsigned long tail, write;
+
+ /*
+ * If the time delta since the last event is too big to
+ * hold in the time field of the event, then we append a
+ * TIME EXTEND event ahead of the data event.
+ */
+ if (unlikely(info->add_timestamp))
+ info->length += RB_LEN_TIME_EXTEND;
+
+ tail_page = info->tail_page = cpu_buffer->tail_page;
+ write = local_add_return(info->length, &tail_page->write);
+
+ /* set write to only the index of the write */
+ write &= RB_WRITE_MASK;
+ tail = write - info->length;
+
+ /*
+ * If this is the first commit on the page, then it has the same
+ * timestamp as the page itself.
+ */
+ if (!tail)
+ info->delta = 0;
+
+	/* See if we shot past the end of this buffer page */
+ if (unlikely(write > BUF_PAGE_SIZE))
+ return rb_move_tail(cpu_buffer, tail, info);
+
+ /* We reserved something on the buffer */
+
+ event = __rb_page_index(tail_page, tail);
+ kmemcheck_annotate_bitfield(event, bitfield);
+ rb_update_event(cpu_buffer, event, info);
+
+ local_inc(&tail_page->entries);
+
+ /*
+ * If this is the first commit on the page, then update
+ * its timestamp.
+ */
+ if (!tail)
+ tail_page->page->time_stamp = info->ts;
+
+ /* account for these added bytes */
+ local_add(info->length, &cpu_buffer->entries_bytes);
+
+ return event;
+}
+
+static struct ring_buffer_event *
+rb_reserve_next_event(struct ring_buffer *buffer,
+ struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned long length)
+{
+ struct ring_buffer_event *event;
+ struct rb_event_info info;
+ int nr_loops = 0;
+ u64 diff;
+
+ rb_start_commit(cpu_buffer);
+
+#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
+ /*
+ * Due to the ability to swap a cpu buffer from a buffer
+ * it is possible it was swapped before we committed.
+ * (committing stops a swap). We check for it here and
+ * if it happened, we have to fail the write.
+ */
+ barrier();
+ if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
+ local_dec(&cpu_buffer->committing);
+ local_dec(&cpu_buffer->commits);
+ return NULL;
+ }
+#endif
+
+ info.length = rb_calculate_event_length(length);
+ again:
+ info.add_timestamp = 0;
+ info.delta = 0;
+
+ /*
+ * We allow for interrupts to reenter here and do a trace.
+ * If one does, it will cause this original code to loop
+ * back here. Even with heavy interrupts happening, this
+ * should only happen a few times in a row. If this happens
+ * 1000 times in a row, there must be either an interrupt
+ * storm or we have something buggy.
+ * Bail!
+ */
+ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
+ goto out_fail;
+
+ info.ts = rb_time_stamp(cpu_buffer->buffer);
+ diff = info.ts - cpu_buffer->write_stamp;
+
+ /* make sure this diff is calculated here */
+ barrier();
+
+ /* Did the write stamp get updated already? */
+ if (likely(info.ts >= cpu_buffer->write_stamp)) {
+ info.delta = diff;
+ if (unlikely(test_time_stamp(info.delta)))
+ rb_handle_timestamp(cpu_buffer, &info);
+ }
+
+ event = __rb_reserve_next(cpu_buffer, &info);
+
+ if (unlikely(PTR_ERR(event) == -EAGAIN))
+ goto again;
+
+ if (!event)
+ goto out_fail;
+
+ return event;
+
+ out_fail:
+ rb_end_commit(cpu_buffer);
+ return NULL;
+}
+
+/**
* ring_buffer_lock_reserve - reserve a part of the buffer
* @buffer: the ring buffer to reserve from
* @length: the length of the data to reserve (excluding event header)
@@ -2729,111 +2874,6 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
}
EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
-static void
-rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event)
-{
- u64 delta;
-
- /*
- * The event first in the commit queue updates the
- * time stamp.
- */
- if (rb_event_is_commit(cpu_buffer, event)) {
- /*
- * A commit event that is first on a page
- * updates the write timestamp with the page stamp
- */
- if (!rb_event_index(event))
- cpu_buffer->write_stamp =
- cpu_buffer->commit_page->page->time_stamp;
- else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
- delta = event->array[0];
- delta <<= TS_SHIFT;
- delta += event->time_delta;
- cpu_buffer->write_stamp += delta;
- } else
- cpu_buffer->write_stamp += event->time_delta;
- }
-}
-
-static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event)
-{
- local_inc(&cpu_buffer->entries);
- rb_update_write_stamp(cpu_buffer, event);
- rb_end_commit(cpu_buffer);
-}
-
-static __always_inline void
-rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
-{
- bool pagebusy;
-
- if (buffer->irq_work.waiters_pending) {
- buffer->irq_work.waiters_pending = false;
- /* irq_work_queue() supplies it's own memory barriers */
- irq_work_queue(&buffer->irq_work.work);
- }
-
- if (cpu_buffer->irq_work.waiters_pending) {
- cpu_buffer->irq_work.waiters_pending = false;
- /* irq_work_queue() supplies it's own memory barriers */
- irq_work_queue(&cpu_buffer->irq_work.work);
- }
-
- pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
-
- if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) {
- cpu_buffer->irq_work.wakeup_full = true;
- cpu_buffer->irq_work.full_waiters_pending = false;
- /* irq_work_queue() supplies it's own memory barriers */
- irq_work_queue(&cpu_buffer->irq_work.work);
- }
-}
-
-/**
- * ring_buffer_unlock_commit - commit a reserved
- * @buffer: The buffer to commit to
- * @event: The event pointer to commit.
- *
- * This commits the data to the ring buffer, and releases any locks held.
- *
- * Must be paired with ring_buffer_lock_reserve.
- */
-int ring_buffer_unlock_commit(struct ring_buffer *buffer,
- struct ring_buffer_event *event)
-{
- struct ring_buffer_per_cpu *cpu_buffer;
- int cpu = raw_smp_processor_id();
-
- cpu_buffer = buffer->buffers[cpu];
-
- rb_commit(cpu_buffer, event);
-
- rb_wakeups(buffer, cpu_buffer);
-
- trace_recursive_unlock(cpu_buffer);
-
- preempt_enable_notrace();
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
-
-static inline void rb_event_discard(struct ring_buffer_event *event)
-{
- if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
- event = skip_time_extend(event);
-
- /* array[0] holds the actual length for the discarded event */
- event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
- event->type_len = RINGBUF_TYPE_PADDING;
- /* time delta must be non zero */
- if (!event->time_delta)
- event->time_delta = 1;
-}
-
/*
* Decrement the entries to the page that an event is on.
* The event does not even need to exist, only the pointer
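The struct rb_event_info these hunks pass around is introduced elsewhere in the patch and never shown here. Judging only from the fields the code reads and writes (ts, delta, length, tail_page, add_timestamp), its shape is roughly the following; treat it as an inference, not the patch's literal definition.

/* Inferred shape of the per-reservation bookkeeping structure. */
struct rb_event_info {
	u64			ts;		/* timestamp taken for this event */
	u64			delta;		/* delta from the last write stamp */
	unsigned long		length;		/* reserved length, header included */
	struct buffer_page	*tail_page;	/* page the reservation landed on */
	int			add_timestamp;	/* need a TIME_EXTEND event? */
};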
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index abcbf7ff8743..6e79408674aa 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3035,7 +3035,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
if (!iter)
return ERR_PTR(-ENOMEM);
- iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(),
+ iter->buffer_iter = kcalloc(nr_cpu_ids, sizeof(*iter->buffer_iter),
GFP_KERNEL);
if (!iter->buffer_iter)
goto release;
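The switch from kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(), ...) to kcalloc(nr_cpu_ids, sizeof(*iter->buffer_iter), ...) also buys an overflow check on the element-count multiplication. A hedged sketch of roughly what kcalloc() adds over the open-coded multiply (not copied from the slab code):

#include <linux/slab.h>
#include <linux/kernel.h>

/* Roughly what kcalloc(n, size, gfp) adds over kzalloc(n * size, gfp). */
static void *checked_zalloc(size_t n, size_t size, gfp_t gfp)
{
	if (size && n > SIZE_MAX / size)	/* n * size would overflow */
		return NULL;
	return kzalloc(n * size, gfp);
}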
@@ -6990,7 +6990,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
trace_init_global_iter(&iter);
for_each_tracing_cpu(cpu) {
- atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled);
+ atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
}
old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 404a372ad85a..7ca09cdc20c2 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -30,6 +30,7 @@
DEFINE_MUTEX(event_mutex);
LIST_HEAD(ftrace_events);
+static LIST_HEAD(ftrace_generic_fields);
static LIST_HEAD(ftrace_common_fields);
#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO)
@@ -94,6 +95,10 @@ trace_find_event_field(struct trace_event_call *call, char *name)
struct ftrace_event_field *field;
struct list_head *head;
+ field = __find_event_field(&ftrace_generic_fields, name);
+ if (field)
+ return field;
+
field = __find_event_field(&ftrace_common_fields, name);
if (field)
return field;
@@ -144,6 +149,13 @@ int trace_define_field(struct trace_event_call *call, const char *type,
}
EXPORT_SYMBOL_GPL(trace_define_field);
+#define __generic_field(type, item, filter_type) \
+ ret = __trace_define_field(&ftrace_generic_fields, #type, \
+ #item, 0, 0, is_signed_type(type), \
+ filter_type); \
+ if (ret) \
+ return ret;
+
#define __common_field(type, item) \
ret = __trace_define_field(&ftrace_common_fields, #type, \
"common_" #item, \
@@ -153,6 +165,16 @@ EXPORT_SYMBOL_GPL(trace_define_field);
if (ret) \
return ret;
+static int trace_define_generic_fields(void)
+{
+ int ret;
+
+ __generic_field(int, cpu, FILTER_OTHER);
+ __generic_field(char *, comm, FILTER_PTR_STRING);
+
+ return ret;
+}
+
static int trace_define_common_fields(void)
{
int ret;
@@ -2671,6 +2693,9 @@ static __init int event_trace_init(void)
if (!entry)
pr_warn("Could not create tracefs 'available_events' entry\n");
+ if (trace_define_generic_fields())
+		pr_warn("tracing: Failed to allocate generic fields");
+
if (trace_define_common_fields())
pr_warn("tracing: Failed to allocate common fields");
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index d81d6f302b14..bd1bf184c5c9 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -252,6 +252,50 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event)
return match;
}
+/* Filter predicate for CPUs. */
+static int filter_pred_cpu(struct filter_pred *pred, void *event)
+{
+ int cpu, cmp;
+ int match = 0;
+
+ cpu = raw_smp_processor_id();
+ cmp = pred->val;
+
+ switch (pred->op) {
+ case OP_EQ:
+ match = cpu == cmp;
+ break;
+ case OP_LT:
+ match = cpu < cmp;
+ break;
+ case OP_LE:
+ match = cpu <= cmp;
+ break;
+ case OP_GT:
+ match = cpu > cmp;
+ break;
+ case OP_GE:
+ match = cpu >= cmp;
+ break;
+ default:
+ break;
+ }
+
+ return !!match == !pred->not;
+}
+
+/* Filter predicate for COMM. */
+static int filter_pred_comm(struct filter_pred *pred, void *event)
+{
+ int cmp, match;
+
+ cmp = pred->regex.match(current->comm, &pred->regex,
+ pred->regex.field_len);
+ match = cmp ^ pred->not;
+
+ return match;
+}
+
static int filter_pred_none(struct filter_pred *pred, void *event)
{
return 0;
@@ -1002,7 +1046,10 @@ static int init_pred(struct filter_parse_state *ps,
if (is_string_field(field)) {
filter_build_regex(pred);
- if (field->filter_type == FILTER_STATIC_STRING) {
+ if (!strcmp(field->name, "comm")) {
+ fn = filter_pred_comm;
+ pred->regex.field_len = TASK_COMM_LEN;
+ } else if (field->filter_type == FILTER_STATIC_STRING) {
fn = filter_pred_string;
pred->regex.field_len = field->size;
} else if (field->filter_type == FILTER_DYN_STRING)
@@ -1025,7 +1072,10 @@ static int init_pred(struct filter_parse_state *ps,
}
pred->val = val;
- fn = select_comparison_fn(pred->op, field->size,
+ if (!strcmp(field->name, "cpu"))
+ fn = filter_pred_cpu;
+ else
+ fn = select_comparison_fn(pred->op, field->size,
field->is_signed);
if (!fn) {
parse_error(ps, FILT_ERR_INVALID_OP, 0);
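filter_pred_cpu() folds the optional negation into its return value with !!match == !pred->not. A runnable check of that expression over all four cases:

#include <stdio.h>

/* The return expression used by filter_pred_cpu(). */
static int pred_result(int match, int not)
{
	return !!match == !not;
}

int main(void)
{
	printf("match=1 not=0 -> %d\n", pred_result(1, 0));	/* 1: matched  */
	printf("match=0 not=0 -> %d\n", pred_result(0, 0));	/* 0: no match */
	printf("match=1 not=1 -> %d\n", pred_result(1, 1));	/* 0: negated  */
	printf("match=0 not=1 -> %d\n", pred_result(0, 1));	/* 1: negated  */
	return 0;
}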
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 8968bf720c12..ca98445782ac 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -715,13 +715,13 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
trace_seq_printf(s, ".%s", nsecs_str);
- len += strlen(nsecs_str);
+ len += strlen(nsecs_str) + 1;
}
trace_seq_puts(s, " us ");
/* Print remaining spaces to fit the row's width */
- for (i = len; i < 7; i++)
+ for (i = len; i < 8; i++)
trace_seq_putc(s, ' ');
}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b7d0cdd9906c..c9956440d0e6 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -165,11 +165,9 @@ DEFINE_BASIC_FETCH_FUNCS(memory)
static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
void *addr, void *dest)
{
- long ret;
int maxlen = get_rloc_len(*(u32 *)dest);
u8 *dst = get_rloc_data(dest);
- u8 *src = addr;
- mm_segment_t old_fs = get_fs();
+ long ret;
if (!maxlen)
return;
@@ -178,23 +176,13 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
* Try to get string again, since the string can be changed while
* probing.
*/
- set_fs(KERNEL_DS);
- pagefault_disable();
-
- do
- ret = __copy_from_user_inatomic(dst++, src++, 1);
- while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
-
- dst[-1] = '\0';
- pagefault_enable();
- set_fs(old_fs);
+ ret = strncpy_from_unsafe(dst, addr, maxlen);
if (ret < 0) { /* Failed to fetch string */
- ((u8 *)get_rloc_data(dest))[0] = '\0';
+ dst[0] = '\0';
*(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
} else {
- *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
- get_rloc_offs(*(u32 *)dest));
+ *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest));
}
}
NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string));
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index dfab253727dc..8e481a84aeea 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -496,6 +496,8 @@ static const struct trace_mark {
char sym;
} mark[] = {
MARK(1000000000ULL , '$'), /* 1 sec */
+ MARK(100000000ULL , '@'), /* 100 msec */
+ MARK(10000000ULL , '*'), /* 10 msec */
MARK(1000000ULL , '#'), /* 1000 usecs */
MARK(100000ULL , '!'), /* 100 usecs */
MARK(10000ULL , '+'), /* 10 usecs */
@@ -508,7 +510,7 @@ char trace_find_mark(unsigned long long d)
int size = ARRAY_SIZE(mark);
for (i = 0; i < size; i++) {
- if (d >= mark[i].val)
+ if (d > mark[i].val)
break;
}
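With the two new marks and the change from >= to >, trace_find_mark() picks symbols as in this runnable sketch. The table entries are the ones visible in the hunk above; the fallback return for very small durations is assumed.

#include <stdio.h>

#define ARRAY_SIZE(a)	(sizeof(a) / sizeof((a)[0]))

struct trace_mark { unsigned long long val; char sym; };

static const struct trace_mark mark[] = {
	{ 1000000000ULL, '$' },	/* 1 sec */
	{  100000000ULL, '@' },	/* 100 msec */
	{   10000000ULL, '*' },	/* 10 msec */
	{    1000000ULL, '#' },	/* 1000 usecs */
	{     100000ULL, '!' },	/* 100 usecs */
	{      10000ULL, '+' },	/* 10 usecs */
};

static char find_mark(unsigned long long d)
{
	size_t i;

	for (i = 0; i < ARRAY_SIZE(mark); i++)
		if (d > mark[i].val)	/* strictly greater, per this patch */
			break;
	return i == ARRAY_SIZE(mark) ? ' ' : mark[i].sym;
}

int main(void)
{
	printf("150 usecs        -> '%c'\n", find_mark(150000ULL));	/* '!' */
	printf("20 msecs         -> '%c'\n", find_mark(20000000ULL));	/* '*' */
	printf("10 msecs exactly -> '%c'\n", find_mark(10000000ULL));	/* '#': equal no longer matches */
	return 0;
}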
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 3f34496244e9..b746399ab59c 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -18,12 +18,6 @@
#define STACK_TRACE_ENTRIES 500
-#ifdef CC_USING_FENTRY
-# define fentry 1
-#else
-# define fentry 0
-#endif
-
static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
{ [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
@@ -35,7 +29,7 @@ static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
*/
static struct stack_trace max_stack_trace = {
.max_entries = STACK_TRACE_ENTRIES - 1,
- .entries = &stack_dump_trace[1],
+ .entries = &stack_dump_trace[0],
};
static unsigned long max_stack_size;
@@ -55,7 +49,7 @@ static inline void print_max_stack(void)
pr_emerg(" Depth Size Location (%d entries)\n"
" ----- ---- --------\n",
- max_stack_trace.nr_entries - 1);
+ max_stack_trace.nr_entries);
for (i = 0; i < max_stack_trace.nr_entries; i++) {
if (stack_dump_trace[i] == ULONG_MAX)
@@ -77,7 +71,7 @@ check_stack(unsigned long ip, unsigned long *stack)
unsigned long this_size, flags; unsigned long *p, *top, *start;
static int tracer_frame;
int frame_size = ACCESS_ONCE(tracer_frame);
- int i;
+ int i, x;
this_size = ((unsigned long)stack) & (THREAD_SIZE-1);
this_size = THREAD_SIZE - this_size;
@@ -105,26 +99,20 @@ check_stack(unsigned long ip, unsigned long *stack)
max_stack_size = this_size;
max_stack_trace.nr_entries = 0;
-
- if (using_ftrace_ops_list_func())
- max_stack_trace.skip = 4;
- else
- max_stack_trace.skip = 3;
+ max_stack_trace.skip = 3;
save_stack_trace(&max_stack_trace);
- /*
- * Add the passed in ip from the function tracer.
- * Searching for this on the stack will skip over
- * most of the overhead from the stack tracer itself.
- */
- stack_dump_trace[0] = ip;
- max_stack_trace.nr_entries++;
+ /* Skip over the overhead of the stack tracer itself */
+ for (i = 0; i < max_stack_trace.nr_entries; i++) {
+ if (stack_dump_trace[i] == ip)
+ break;
+ }
/*
* Now find where in the stack these are.
*/
- i = 0;
+ x = 0;
start = stack;
top = (unsigned long *)
(((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE);
@@ -139,12 +127,15 @@ check_stack(unsigned long ip, unsigned long *stack)
while (i < max_stack_trace.nr_entries) {
int found = 0;
- stack_dump_index[i] = this_size;
+ stack_dump_index[x] = this_size;
p = start;
for (; p < top && i < max_stack_trace.nr_entries; p++) {
+ if (stack_dump_trace[i] == ULONG_MAX)
+ break;
if (*p == stack_dump_trace[i]) {
- this_size = stack_dump_index[i++] =
+ stack_dump_trace[x] = stack_dump_trace[i++];
+ this_size = stack_dump_index[x++] =
(top - p) * sizeof(unsigned long);
found = 1;
/* Start the search from here */
@@ -156,7 +147,7 @@ check_stack(unsigned long ip, unsigned long *stack)
* out what that is, then figure it out
* now.
*/
- if (unlikely(!tracer_frame) && i == 1) {
+ if (unlikely(!tracer_frame)) {
tracer_frame = (p - stack) *
sizeof(unsigned long);
max_stack_size -= tracer_frame;
@@ -168,6 +159,10 @@ check_stack(unsigned long ip, unsigned long *stack)
i++;
}
+ max_stack_trace.nr_entries = x;
+ for (; x < i; x++)
+ stack_dump_trace[x] = ULONG_MAX;
+
if (task_stack_end_corrupted(current)) {
print_max_stack();
BUG();
@@ -192,24 +187,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
if (per_cpu(trace_active, cpu)++ != 0)
goto out;
- /*
- * When fentry is used, the traced function does not get
- * its stack frame set up, and we lose the parent.
- * The ip is pretty useless because the function tracer
- * was called before that function set up its stack frame.
- * In this case, we use the parent ip.
- *
- * By adding the return address of either the parent ip
- * or the current ip we can disregard most of the stack usage
- * caused by the stack tracer itself.
- *
- * The function tracer always reports the address of where the
- * mcount call was, but the stack will hold the return address.
- */
- if (fentry)
- ip = parent_ip;
- else
- ip += MCOUNT_INSN_SIZE;
+ ip += MCOUNT_INSN_SIZE;
check_stack(ip, &stack);
@@ -284,7 +262,7 @@ __next(struct seq_file *m, loff_t *pos)
{
long n = *pos - 1;
- if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX)
+ if (n > max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX)
return NULL;
m->private = (void *)n;
@@ -354,7 +332,7 @@ static int t_show(struct seq_file *m, void *v)
seq_printf(m, " Depth Size Location"
" (%d entries)\n"
" ----- ---- --------\n",
- max_stack_trace.nr_entries - 1);
+ max_stack_trace.nr_entries);
if (!stack_tracer_enabled && !max_stack_size)
print_disabled(m);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index f65a0a06a8c0..88fefa68c516 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -39,6 +39,7 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
cred->cap_inheritable = CAP_EMPTY_SET;
cred->cap_permitted = CAP_FULL_SET;
cred->cap_effective = CAP_FULL_SET;
+ cred->cap_ambient = CAP_EMPTY_SET;
cred->cap_bset = CAP_FULL_SET;
#ifdef CONFIG_KEYS
key_put(cred->request_key_auth);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index a6ffa43f2993..64ed1c37bd1f 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -24,6 +24,7 @@
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
#include <linux/perf_event.h>
+#include <linux/kthread.h>
/*
* The run state of the lockup detectors is controlled by the content of the
@@ -66,7 +67,26 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
#define for_each_watchdog_cpu(cpu) \
for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
+/*
+ * The 'watchdog_running' variable is set to 1 when the watchdog threads
+ * are registered/started and is set to 0 when the watchdog threads are
+ * unregistered/stopped, so it is an indicator whether the threads exist.
+ */
static int __read_mostly watchdog_running;
+/*
+ * If a subsystem has a need to deactivate the watchdog temporarily, it
+ * can use the suspend/resume interface to achieve this. The content of
+ * the 'watchdog_suspended' variable reflects this state. Existing threads
+ * are parked/unparked by the lockup_detector_{suspend|resume} functions
+ * (see comment blocks pertaining to those functions for further details).
+ *
+ * 'watchdog_suspended' also prevents threads from being registered/started
+ * or unregistered/stopped via parameters in /proc/sys/kernel, so the state
+ * of 'watchdog_running' cannot change while the watchdog is deactivated
+ * temporarily (see related code in 'proc' handlers).
+ */
+static int __read_mostly watchdog_suspended;
+
static u64 __read_mostly sample_period;
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -613,46 +633,9 @@ static void watchdog_nmi_disable(unsigned int cpu)
}
}
-void watchdog_nmi_enable_all(void)
-{
- int cpu;
-
- mutex_lock(&watchdog_proc_mutex);
-
- if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
- goto unlock;
-
- get_online_cpus();
- for_each_watchdog_cpu(cpu)
- watchdog_nmi_enable(cpu);
- put_online_cpus();
-
-unlock:
- mutex_unlock(&watchdog_proc_mutex);
-}
-
-void watchdog_nmi_disable_all(void)
-{
- int cpu;
-
- mutex_lock(&watchdog_proc_mutex);
-
- if (!watchdog_running)
- goto unlock;
-
- get_online_cpus();
- for_each_watchdog_cpu(cpu)
- watchdog_nmi_disable(cpu);
- put_online_cpus();
-
-unlock:
- mutex_unlock(&watchdog_proc_mutex);
-}
#else
static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
static void watchdog_nmi_disable(unsigned int cpu) { return; }
-void watchdog_nmi_enable_all(void) {}
-void watchdog_nmi_disable_all(void) {}
#endif /* CONFIG_HARDLOCKUP_DETECTOR */
static struct smp_hotplug_thread watchdog_threads = {
@@ -666,46 +649,89 @@ static struct smp_hotplug_thread watchdog_threads = {
.unpark = watchdog_enable,
};
-static void restart_watchdog_hrtimer(void *info)
+/*
+ * park all watchdog threads that are specified in 'watchdog_cpumask'
+ */
+static int watchdog_park_threads(void)
{
- struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
- int ret;
+ int cpu, ret = 0;
+ get_online_cpus();
+ for_each_watchdog_cpu(cpu) {
+ ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
+ if (ret)
+ break;
+ }
+ if (ret) {
+ for_each_watchdog_cpu(cpu)
+ kthread_unpark(per_cpu(softlockup_watchdog, cpu));
+ }
+ put_online_cpus();
+
+ return ret;
+}
+
+/*
+ * unpark all watchdog threads that are specified in 'watchdog_cpumask'
+ */
+static void watchdog_unpark_threads(void)
+{
+ int cpu;
+
+ get_online_cpus();
+ for_each_watchdog_cpu(cpu)
+ kthread_unpark(per_cpu(softlockup_watchdog, cpu));
+ put_online_cpus();
+}
+
+/*
+ * Suspend the hard and soft lockup detector by parking the watchdog threads.
+ */
+int lockup_detector_suspend(void)
+{
+ int ret = 0;
+
+ mutex_lock(&watchdog_proc_mutex);
/*
- * No need to cancel and restart hrtimer if it is currently executing
- * because it will reprogram itself with the new period now.
- * We should never see it unqueued here because we are running per-cpu
- * with interrupts disabled.
+ * Multiple suspend requests can be active in parallel (counted by
+ * the 'watchdog_suspended' variable). If the watchdog threads are
+ * running, the first caller takes care that they will be parked.
+ * The state of 'watchdog_running' cannot change while a suspend
+ * request is active (see related code in 'proc' handlers).
*/
- ret = hrtimer_try_to_cancel(hrtimer);
- if (ret == 1)
- hrtimer_start(hrtimer, ns_to_ktime(sample_period),
- HRTIMER_MODE_REL_PINNED);
+ if (watchdog_running && !watchdog_suspended)
+ ret = watchdog_park_threads();
+
+ if (ret == 0)
+ watchdog_suspended++;
+
+ mutex_unlock(&watchdog_proc_mutex);
+
+ return ret;
}
-static void update_watchdog(int cpu)
+/*
+ * Resume the hard and soft lockup detector by unparking the watchdog threads.
+ */
+void lockup_detector_resume(void)
{
+ mutex_lock(&watchdog_proc_mutex);
+
+ watchdog_suspended--;
/*
- * Make sure that perf event counter will adopt to a new
- * sampling period. Updating the sampling period directly would
- * be much nicer but we do not have an API for that now so
- * let's use a big hammer.
- * Hrtimer will adopt the new period on the next tick but this
- * might be late already so we have to restart the timer as well.
+ * The watchdog threads are unparked if they were previously running
+ * and if there is no more active suspend request.
*/
- watchdog_nmi_disable(cpu);
- smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1);
- watchdog_nmi_enable(cpu);
+ if (watchdog_running && !watchdog_suspended)
+ watchdog_unpark_threads();
+
+ mutex_unlock(&watchdog_proc_mutex);
}
static void update_watchdog_all_cpus(void)
{
- int cpu;
-
- get_online_cpus();
- for_each_watchdog_cpu(cpu)
- update_watchdog(cpu);
- put_online_cpus();
+ watchdog_park_threads();
+ watchdog_unpark_threads();
}
static int watchdog_enable_all_cpus(void)
@@ -713,15 +739,12 @@ static int watchdog_enable_all_cpus(void)
int err = 0;
if (!watchdog_running) {
- err = smpboot_register_percpu_thread(&watchdog_threads);
+ err = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
+ &watchdog_cpumask);
if (err)
pr_err("Failed to create watchdog threads, disabled\n");
- else {
- if (smpboot_update_cpumask_percpu_thread(
- &watchdog_threads, &watchdog_cpumask))
- pr_err("Failed to set cpumask for watchdog threads\n");
+ else
watchdog_running = 1;
- }
} else {
/*
* Enable/disable the lockup detectors or
@@ -787,6 +810,12 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
mutex_lock(&watchdog_proc_mutex);
+ if (watchdog_suspended) {
+ /* no parameter changes allowed while watchdog is suspended */
+ err = -EAGAIN;
+ goto out;
+ }
+
/*
* If the parameter is being read return the state of the corresponding
* bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
@@ -872,6 +901,12 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
mutex_lock(&watchdog_proc_mutex);
+ if (watchdog_suspended) {
+ /* no parameter changes allowed while watchdog is suspended */
+ err = -EAGAIN;
+ goto out;
+ }
+
old = ACCESS_ONCE(watchdog_thresh);
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
@@ -903,6 +938,13 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
int err;
mutex_lock(&watchdog_proc_mutex);
+
+ if (watchdog_suspended) {
+ /* no parameter changes allowed while watchdog is suspended */
+ err = -EAGAIN;
+ goto out;
+ }
+
err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
if (!err && write) {
/* Remove impossible cpus to keep sysctl output cleaner. */
@@ -920,6 +962,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
pr_err("cpumask update failed\n");
}
}
+out:
mutex_unlock(&watchdog_proc_mutex);
return err;
}
@@ -932,10 +975,8 @@ void __init lockup_detector_init(void)
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_enabled()) {
- if (!cpumask_empty(tick_nohz_full_mask))
- pr_info("Disabling watchdog on nohz_full cores by default\n");
- cpumask_andnot(&watchdog_cpumask, cpu_possible_mask,
- tick_nohz_full_mask);
+ pr_info("Disabling watchdog on nohz_full cores by default\n");
+ cpumask_copy(&watchdog_cpumask, housekeeping_mask);
} else
cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
#else
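lockup_detector_suspend() and lockup_detector_resume() are reference counted so that nested users stack correctly: only the first suspend parks the watchdog threads and only the last resume unparks them. A runnable, deliberately simplified userspace model of that counting (no mutex, no park-failure handling):

#include <stdio.h>

/* Illustrative stand-ins for the real park/unpark operations. */
static int watchdog_running = 1;
static int watchdog_suspended;

static void park_threads(void)   { printf("threads parked\n"); }
static void unpark_threads(void) { printf("threads unparked\n"); }

static void suspend(void)
{
	/* Only the first active suspend request actually parks. */
	if (watchdog_running && !watchdog_suspended)
		park_threads();
	watchdog_suspended++;
}

static void resume(void)
{
	watchdog_suspended--;
	/* Only the last resume, with no suspends left, unparks. */
	if (watchdog_running && !watchdog_suspended)
		unpark_threads();
}

int main(void)
{
	suspend();	/* parks */
	suspend();	/* nested: no-op */
	resume();	/* still suspended: no-op */
	resume();	/* unparks */
	return 0;
}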
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 811edb77dd6d..ca71582fcfab 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2612,7 +2612,7 @@ void flush_workqueue(struct workqueue_struct *wq)
out_unlock:
mutex_unlock(&wq->mutex);
}
-EXPORT_SYMBOL_GPL(flush_workqueue);
+EXPORT_SYMBOL(flush_workqueue);
/**
* drain_workqueue - drain a workqueue