diff options
Diffstat (limited to 'kernel')
67 files changed, 5076 insertions, 4277 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 43c4c920f30a..d4988410b410 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -45,16 +45,18 @@ ifneq ($(CONFIG_SMP),y) obj-y += up.o endif obj-$(CONFIG_UID16) += uid16.o -obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_MODULE_SIG) += module_signing.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o +obj-$(CONFIG_KEXEC_CORE) += kexec_core.o obj-$(CONFIG_KEXEC) += kexec.o +obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup.o obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o +obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_USER_NS) += user_namespace.o @@ -64,7 +66,7 @@ obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o -obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o +obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_KPROBES) += kprobes.o @@ -99,6 +101,8 @@ obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o +obj-$(CONFIG_HAS_IOMEM) += memremap.o + $(obj)/configs.o: $(obj)/config_data.h # config_data.h contains the same information as ikconfig.h but gzipped. @@ -111,99 +115,3 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call filechk,ikconfiggz) - -############################################################################### -# -# Roll all the X.509 certificates that we can find together and pull them into -# the kernel so that they get loaded into the system trusted keyring during -# boot. -# -# We look in the source root and the build root for all files whose name ends -# in ".x509". Unfortunately, this will generate duplicate filenames, so we -# have make canonicalise the pathnames and then sort them to discard the -# duplicates. -# -############################################################################### -ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y) -X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509) -X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509 -X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \ - $(or $(realpath $(CERT)),$(CERT)))) -X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw)) - -ifeq ($(X509_CERTIFICATES),) -$(warning *** No X.509 certificates found ***) -endif - -ifneq ($(wildcard $(obj)/.x509.list),) -ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES)) -$(warning X.509 certificate list changed to "$(X509_CERTIFICATES)" from "$(shell cat $(obj)/.x509.list)") -$(shell rm $(obj)/.x509.list) -endif -endif - -kernel/system_certificates.o: $(obj)/x509_certificate_list - -quiet_cmd_x509certs = CERTS $@ - cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; $(kecho) " - Including cert $(X509)") - -targets += $(obj)/x509_certificate_list -$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list - $(call if_changed,x509certs) - -targets += $(obj)/.x509.list -$(obj)/.x509.list: - @echo $(X509_CERTIFICATES) >$@ -endif - -clean-files := x509_certificate_list .x509.list - -ifeq ($(CONFIG_MODULE_SIG),y) -############################################################################### -# -# If module signing is requested, say by allyesconfig, but a key has not been -# supplied, then one will need to be generated to make sure the build does not -# fail and that the kernel may be used afterwards. -# -############################################################################### -ifndef CONFIG_MODULE_SIG_HASH -$(error Could not determine digest type to use from kernel config) -endif - -signing_key.priv signing_key.x509: x509.genkey - @echo "###" - @echo "### Now generating an X.509 key pair to be used for signing modules." - @echo "###" - @echo "### If this takes a long time, you might wish to run rngd in the" - @echo "### background to keep the supply of entropy topped up. It" - @echo "### needs to be run as root, and uses a hardware random" - @echo "### number generator if one is available." - @echo "###" - openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \ - -batch -x509 -config x509.genkey \ - -outform DER -out signing_key.x509 \ - -keyout signing_key.priv 2>&1 - @echo "###" - @echo "### Key pair generated." - @echo "###" - -x509.genkey: - @echo Generating X.509 key generation config - @echo >x509.genkey "[ req ]" - @echo >>x509.genkey "default_bits = 4096" - @echo >>x509.genkey "distinguished_name = req_distinguished_name" - @echo >>x509.genkey "prompt = no" - @echo >>x509.genkey "string_mask = utf8only" - @echo >>x509.genkey "x509_extensions = myexts" - @echo >>x509.genkey - @echo >>x509.genkey "[ req_distinguished_name ]" - @echo >>x509.genkey "#O = Unspecified company" - @echo >>x509.genkey "CN = Build time autogenerated kernel key" - @echo >>x509.genkey "#emailAddress = unspecified.user@unspecified.company" - @echo >>x509.genkey - @echo >>x509.genkey "[ myexts ]" - @echo >>x509.genkey "basicConstraints=critical,CA:FALSE" - @echo >>x509.genkey "keyUsage=digitalSignature" - @echo >>x509.genkey "subjectKeyIdentifier=hash" - @echo >>x509.genkey "authorityKeyIdentifier=keyid" -endif diff --git a/kernel/audit.c b/kernel/audit.c index f9e6065346db..662c007635fb 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1761,7 +1761,7 @@ void audit_log_name(struct audit_context *context, struct audit_names *n, } else audit_log_format(ab, " name=(null)"); - if (n->ino != (unsigned long)-1) + if (n->ino != AUDIT_INO_UNSET) audit_log_format(ab, " inode=%lu" " dev=%02x:%02x mode=%#ho" " ouid=%u ogid=%u rdev=%02x:%02x", diff --git a/kernel/audit.h b/kernel/audit.h index d641f9bb3ed0..dadf86a0e59e 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -50,6 +50,7 @@ enum audit_state { /* Rule lists */ struct audit_watch; +struct audit_fsnotify_mark; struct audit_tree; struct audit_chunk; @@ -252,6 +253,7 @@ struct audit_net { extern int selinux_audit_rule_update(void); extern struct mutex audit_filter_mutex; +extern int audit_del_rule(struct audit_entry *); extern void audit_free_rule_rcu(struct rcu_head *); extern struct list_head audit_filter_list[]; @@ -269,6 +271,15 @@ extern int audit_add_watch(struct audit_krule *krule, struct list_head **list); extern void audit_remove_watch_rule(struct audit_krule *krule); extern char *audit_watch_path(struct audit_watch *watch); extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev); + +extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len); +extern char *audit_mark_path(struct audit_fsnotify_mark *mark); +extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark); +extern void audit_remove_mark_rule(struct audit_krule *krule); +extern int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev); +extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old); +extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark); + #else #define audit_put_watch(w) {} #define audit_get_watch(w) {} @@ -278,6 +289,13 @@ extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev #define audit_watch_path(w) "" #define audit_watch_compare(w, i, d) 0 +#define audit_alloc_mark(k, p, l) (ERR_PTR(-EINVAL)) +#define audit_mark_path(m) "" +#define audit_remove_mark(m) +#define audit_remove_mark_rule(k) +#define audit_mark_compare(m, i, d) 0 +#define audit_exe_compare(t, m) (-EINVAL) +#define audit_dupe_exe(n, o) (-EINVAL) #endif /* CONFIG_AUDIT_WATCH */ #ifdef CONFIG_AUDIT_TREE diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c new file mode 100644 index 000000000000..27c6046c2c3d --- /dev/null +++ b/kernel/audit_fsnotify.c @@ -0,0 +1,216 @@ +/* audit_fsnotify.c -- tracking inodes + * + * Copyright 2003-2009,2014-2015 Red Hat, Inc. + * Copyright 2005 Hewlett-Packard Development Company, L.P. + * Copyright 2005 IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/kernel.h> +#include <linux/audit.h> +#include <linux/kthread.h> +#include <linux/mutex.h> +#include <linux/fs.h> +#include <linux/fsnotify_backend.h> +#include <linux/namei.h> +#include <linux/netlink.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/security.h> +#include "audit.h" + +/* + * this mark lives on the parent directory of the inode in question. + * but dev, ino, and path are about the child + */ +struct audit_fsnotify_mark { + dev_t dev; /* associated superblock device */ + unsigned long ino; /* associated inode number */ + char *path; /* insertion path */ + struct fsnotify_mark mark; /* fsnotify mark on the inode */ + struct audit_krule *rule; +}; + +/* fsnotify handle. */ +static struct fsnotify_group *audit_fsnotify_group; + +/* fsnotify events we care about. */ +#define AUDIT_FS_EVENTS (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ + FS_MOVE_SELF | FS_EVENT_ON_CHILD) + +static void audit_fsnotify_mark_free(struct audit_fsnotify_mark *audit_mark) +{ + kfree(audit_mark->path); + kfree(audit_mark); +} + +static void audit_fsnotify_free_mark(struct fsnotify_mark *mark) +{ + struct audit_fsnotify_mark *audit_mark; + + audit_mark = container_of(mark, struct audit_fsnotify_mark, mark); + audit_fsnotify_mark_free(audit_mark); +} + +char *audit_mark_path(struct audit_fsnotify_mark *mark) +{ + return mark->path; +} + +int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev) +{ + if (mark->ino == AUDIT_INO_UNSET) + return 0; + return (mark->ino == ino) && (mark->dev == dev); +} + +static void audit_update_mark(struct audit_fsnotify_mark *audit_mark, + struct inode *inode) +{ + audit_mark->dev = inode ? inode->i_sb->s_dev : AUDIT_DEV_UNSET; + audit_mark->ino = inode ? inode->i_ino : AUDIT_INO_UNSET; +} + +struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len) +{ + struct audit_fsnotify_mark *audit_mark; + struct path path; + struct dentry *dentry; + struct inode *inode; + int ret; + + if (pathname[0] != '/' || pathname[len-1] == '/') + return ERR_PTR(-EINVAL); + + dentry = kern_path_locked(pathname, &path); + if (IS_ERR(dentry)) + return (void *)dentry; /* returning an error */ + inode = path.dentry->d_inode; + mutex_unlock(&inode->i_mutex); + + audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL); + if (unlikely(!audit_mark)) { + audit_mark = ERR_PTR(-ENOMEM); + goto out; + } + + fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_free_mark); + audit_mark->mark.mask = AUDIT_FS_EVENTS; + audit_mark->path = pathname; + audit_update_mark(audit_mark, dentry->d_inode); + audit_mark->rule = krule; + + ret = fsnotify_add_mark(&audit_mark->mark, audit_fsnotify_group, inode, NULL, true); + if (ret < 0) { + audit_fsnotify_mark_free(audit_mark); + audit_mark = ERR_PTR(ret); + } +out: + dput(dentry); + path_put(&path); + return audit_mark; +} + +static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, char *op) +{ + struct audit_buffer *ab; + struct audit_krule *rule = audit_mark->rule; + + if (!audit_enabled) + return; + ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); + if (unlikely(!ab)) + return; + audit_log_format(ab, "auid=%u ses=%u op=", + from_kuid(&init_user_ns, audit_get_loginuid(current)), + audit_get_sessionid(current)); + audit_log_string(ab, op); + audit_log_format(ab, " path="); + audit_log_untrustedstring(ab, audit_mark->path); + audit_log_key(ab, rule->filterkey); + audit_log_format(ab, " list=%d res=1", rule->listnr); + audit_log_end(ab); +} + +void audit_remove_mark(struct audit_fsnotify_mark *audit_mark) +{ + fsnotify_destroy_mark(&audit_mark->mark, audit_fsnotify_group); + fsnotify_put_mark(&audit_mark->mark); +} + +void audit_remove_mark_rule(struct audit_krule *krule) +{ + struct audit_fsnotify_mark *mark = krule->exe; + + audit_remove_mark(mark); +} + +static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark) +{ + struct audit_krule *rule = audit_mark->rule; + struct audit_entry *entry = container_of(rule, struct audit_entry, rule); + + audit_mark_log_rule_change(audit_mark, "autoremove_rule"); + audit_del_rule(entry); +} + +/* Update mark data in audit rules based on fsnotify events. */ +static int audit_mark_handle_event(struct fsnotify_group *group, + struct inode *to_tell, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + u32 mask, void *data, int data_type, + const unsigned char *dname, u32 cookie) +{ + struct audit_fsnotify_mark *audit_mark; + struct inode *inode = NULL; + + audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark); + + BUG_ON(group != audit_fsnotify_group); + + switch (data_type) { + case (FSNOTIFY_EVENT_PATH): + inode = ((struct path *)data)->dentry->d_inode; + break; + case (FSNOTIFY_EVENT_INODE): + inode = (struct inode *)data; + break; + default: + BUG(); + return 0; + }; + + if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) { + if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL)) + return 0; + audit_update_mark(audit_mark, inode); + } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) + audit_autoremove_mark_rule(audit_mark); + + return 0; +} + +static const struct fsnotify_ops audit_mark_fsnotify_ops = { + .handle_event = audit_mark_handle_event, +}; + +static int __init audit_fsnotify_init(void) +{ + audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops); + if (IS_ERR(audit_fsnotify_group)) { + audit_fsnotify_group = NULL; + audit_panic("cannot create audit fsnotify group"); + } + return 0; +} +device_initcall(audit_fsnotify_init); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index b0f9877273fc..94ecdabda8e6 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -479,6 +479,8 @@ static void kill_rules(struct audit_tree *tree) if (rule->tree) { /* not a half-baked one */ audit_tree_log_remove_rule(rule); + if (entry->rule.exe) + audit_remove_mark(entry->rule.exe); rule->tree = NULL; list_del_rcu(&entry->list); list_del(&entry->rule.list); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 6e30024d9aac..656c7e93ac0d 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -138,7 +138,7 @@ char *audit_watch_path(struct audit_watch *watch) int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) { - return (watch->ino != (unsigned long)-1) && + return (watch->ino != AUDIT_INO_UNSET) && (watch->ino == ino) && (watch->dev == dev); } @@ -179,8 +179,8 @@ static struct audit_watch *audit_init_watch(char *path) INIT_LIST_HEAD(&watch->rules); atomic_set(&watch->count, 1); watch->path = path; - watch->dev = (dev_t)-1; - watch->ino = (unsigned long)-1; + watch->dev = AUDIT_DEV_UNSET; + watch->ino = AUDIT_INO_UNSET; return watch; } @@ -203,7 +203,6 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op) if (IS_ERR(watch)) return PTR_ERR(watch); - audit_get_watch(watch); krule->watch = watch; return 0; @@ -313,6 +312,8 @@ static void audit_update_watch(struct audit_parent *parent, list_replace(&oentry->rule.list, &nentry->rule.list); } + if (oentry->rule.exe) + audit_remove_mark(oentry->rule.exe); audit_watch_log_rule_change(r, owatch, "updated_rules"); @@ -343,6 +344,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent) list_for_each_entry_safe(r, nextr, &w->rules, rlist) { e = container_of(r, struct audit_entry, rule); audit_watch_log_rule_change(r, w, "remove_rule"); + if (e->rule.exe) + audit_remove_mark(e->rule.exe); list_del(&r->rlist); list_del(&r->list); list_del_rcu(&e->list); @@ -387,19 +390,20 @@ static void audit_add_to_parent(struct audit_krule *krule, watch_found = 1; - /* put krule's and initial refs to temporary watch */ - audit_put_watch(watch); + /* put krule's ref to temporary watch */ audit_put_watch(watch); audit_get_watch(w); krule->watch = watch = w; + + audit_put_parent(parent); break; } if (!watch_found) { - audit_get_parent(parent); watch->parent = parent; + audit_get_watch(watch); list_add(&watch->wlist, &parent->watches); } list_add(&krule->rlist, &watch->rules); @@ -437,9 +441,6 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) audit_add_to_parent(krule, parent); - /* match get in audit_find_parent or audit_init_parent */ - audit_put_parent(parent); - h = audit_hash_ino((u32)watch->ino); *list = &audit_inode_hash[h]; error: @@ -496,7 +497,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group, if (mask & (FS_CREATE|FS_MOVED_TO) && inode) audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); else if (mask & (FS_DELETE|FS_MOVED_FROM)) - audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); + audit_update_watch(parent, dname, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1); else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) audit_remove_parent_watches(parent); @@ -517,3 +518,36 @@ static int __init audit_watch_init(void) return 0; } device_initcall(audit_watch_init); + +int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old) +{ + struct audit_fsnotify_mark *audit_mark; + char *pathname; + + pathname = kstrdup(audit_mark_path(old->exe), GFP_KERNEL); + if (!pathname) + return -ENOMEM; + + audit_mark = audit_alloc_mark(new, pathname, strlen(pathname)); + if (IS_ERR(audit_mark)) { + kfree(pathname); + return PTR_ERR(audit_mark); + } + new->exe = audit_mark; + + return 0; +} + +int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) +{ + struct file *exe_file; + unsigned long ino; + dev_t dev; + + rcu_read_lock(); + exe_file = rcu_dereference(tsk->mm->exe_file); + ino = exe_file->f_inode->i_ino; + dev = exe_file->f_inode->i_sb->s_dev; + rcu_read_unlock(); + return audit_mark_compare(mark, ino, dev); +} diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 72e1660a79a3..7714d93edb85 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -405,6 +405,12 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) if (f->val > AUDIT_MAX_FIELD_COMPARE) return -EINVAL; break; + case AUDIT_EXE: + if (f->op != Audit_equal) + return -EINVAL; + if (entry->rule.listnr != AUDIT_FILTER_EXIT) + return -EINVAL; + break; }; return 0; } @@ -419,6 +425,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, size_t remain = datasz - sizeof(struct audit_rule_data); int i; char *str; + struct audit_fsnotify_mark *audit_mark; entry = audit_to_entry_common(data); if (IS_ERR(entry)) @@ -539,6 +546,24 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, entry->rule.buflen += f->val; entry->rule.filterkey = str; break; + case AUDIT_EXE: + if (entry->rule.exe || f->val > PATH_MAX) + goto exit_free; + str = audit_unpack_string(&bufp, &remain, f->val); + if (IS_ERR(str)) { + err = PTR_ERR(str); + goto exit_free; + } + entry->rule.buflen += f->val; + + audit_mark = audit_alloc_mark(&entry->rule, str, f->val); + if (IS_ERR(audit_mark)) { + kfree(str); + err = PTR_ERR(audit_mark); + goto exit_free; + } + entry->rule.exe = audit_mark; + break; } } @@ -549,10 +574,10 @@ exit_nofree: return entry; exit_free: - if (entry->rule.watch) - audit_put_watch(entry->rule.watch); /* matches initial get */ if (entry->rule.tree) audit_put_tree(entry->rule.tree); /* that's the temporary one */ + if (entry->rule.exe) + audit_remove_mark(entry->rule.exe); /* that's the template one */ audit_free_rule(entry); return ERR_PTR(err); } @@ -617,6 +642,10 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data->buflen += data->values[i] = audit_pack_string(&bufp, krule->filterkey); break; + case AUDIT_EXE: + data->buflen += data->values[i] = + audit_pack_string(&bufp, audit_mark_path(krule->exe)); + break; case AUDIT_LOGINUID_SET: if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) { data->fields[i] = AUDIT_LOGINUID; @@ -680,6 +709,12 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) if (strcmp(a->filterkey, b->filterkey)) return 1; break; + case AUDIT_EXE: + /* both paths exist based on above type compare */ + if (strcmp(audit_mark_path(a->exe), + audit_mark_path(b->exe))) + return 1; + break; case AUDIT_UID: case AUDIT_EUID: case AUDIT_SUID: @@ -801,8 +836,14 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old) err = -ENOMEM; else new->filterkey = fk; + break; + case AUDIT_EXE: + err = audit_dupe_exe(new, old); + break; } if (err) { + if (new->exe) + audit_remove_mark(new->exe); audit_free_rule(entry); return ERR_PTR(err); } @@ -863,7 +904,7 @@ static inline int audit_add_rule(struct audit_entry *entry) struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; struct list_head *list; - int err; + int err = 0; #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; @@ -881,7 +922,7 @@ static inline int audit_add_rule(struct audit_entry *entry) /* normally audit_add_tree_rule() will free it on failure */ if (tree) audit_put_tree(tree); - goto error; + return err; } if (watch) { @@ -895,14 +936,14 @@ static inline int audit_add_rule(struct audit_entry *entry) */ if (tree) audit_put_tree(tree); - goto error; + return err; } } if (tree) { err = audit_add_tree_rule(&entry->rule); if (err) { mutex_unlock(&audit_filter_mutex); - goto error; + return err; } } @@ -933,19 +974,13 @@ static inline int audit_add_rule(struct audit_entry *entry) #endif mutex_unlock(&audit_filter_mutex); - return 0; - -error: - if (watch) - audit_put_watch(watch); /* tmp watch, matches initial get */ return err; } /* Remove an existing rule from filterlist. */ -static inline int audit_del_rule(struct audit_entry *entry) +int audit_del_rule(struct audit_entry *entry) { struct audit_entry *e; - struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; struct list_head *list; int ret = 0; @@ -961,7 +996,6 @@ static inline int audit_del_rule(struct audit_entry *entry) mutex_lock(&audit_filter_mutex); e = audit_find_rule(entry, &list); if (!e) { - mutex_unlock(&audit_filter_mutex); ret = -ENOENT; goto out; } @@ -972,9 +1006,8 @@ static inline int audit_del_rule(struct audit_entry *entry) if (e->rule.tree) audit_remove_tree_rule(&e->rule); - list_del_rcu(&e->list); - list_del(&e->rule.list); - call_rcu(&e->rcu, audit_free_rule_rcu); + if (e->rule.exe) + audit_remove_mark_rule(&e->rule); #ifdef CONFIG_AUDITSYSCALL if (!dont_count) @@ -983,11 +1016,14 @@ static inline int audit_del_rule(struct audit_entry *entry) if (!audit_match_signal(entry)) audit_signals--; #endif - mutex_unlock(&audit_filter_mutex); + + list_del_rcu(&e->list); + list_del(&e->rule.list); + call_rcu(&e->rcu, audit_free_rule_rcu); out: - if (watch) - audit_put_watch(watch); /* match initial get */ + mutex_unlock(&audit_filter_mutex); + if (tree) audit_put_tree(tree); /* that's the temporary one */ @@ -1077,8 +1113,11 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, WARN_ON(1); } - if (err || type == AUDIT_DEL_RULE) + if (err || type == AUDIT_DEL_RULE) { + if (entry->rule.exe) + audit_remove_mark(entry->rule.exe); audit_free_rule(entry); + } return err; } @@ -1370,6 +1409,8 @@ static int update_lsm_rule(struct audit_krule *r) return 0; nentry = audit_dupe_rule(r); + if (entry->rule.exe) + audit_remove_mark(entry->rule.exe); if (IS_ERR(nentry)) { /* save the first error encountered for the * return value */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e85bdfd15fed..b86cc04959de 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -180,7 +180,7 @@ static int audit_match_filetype(struct audit_context *ctx, int val) return 0; list_for_each_entry(n, &ctx->names_list, list) { - if ((n->ino != -1) && + if ((n->ino != AUDIT_INO_UNSET) && ((n->mode & S_IFMT) == mode)) return 1; } @@ -466,6 +466,9 @@ static int audit_filter_rules(struct task_struct *tsk, result = audit_comparator(ctx->ppid, f->op, f->val); } break; + case AUDIT_EXE: + result = audit_exe_compare(tsk, rule->exe); + break; case AUDIT_UID: result = audit_uid_comparator(cred->uid, f->op, f->uid); break; @@ -1680,7 +1683,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context, aname->should_free = true; } - aname->ino = (unsigned long)-1; + aname->ino = AUDIT_INO_UNSET; aname->type = type; list_add_tail(&aname->list, &context->names_list); @@ -1922,7 +1925,7 @@ void __audit_inode_child(const struct inode *parent, if (inode) audit_copy_inode(found_child, dentry, inode); else - found_child->ino = (unsigned long)-1; + found_child->ino = AUDIT_INO_UNSET; } EXPORT_SYMBOL_GPL(__audit_inode_child); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index cb31229a6fa4..29ace107f236 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -150,15 +150,15 @@ static int __init register_array_map(void) } late_initcall(register_array_map); -static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) +static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr) { - /* only bpf_prog file descriptors can be stored in prog_array map */ + /* only file descriptors can be stored in this type of map */ if (attr->value_size != sizeof(u32)) return ERR_PTR(-EINVAL); return array_map_alloc(attr); } -static void prog_array_map_free(struct bpf_map *map) +static void fd_array_map_free(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; @@ -167,21 +167,21 @@ static void prog_array_map_free(struct bpf_map *map) /* make sure it's empty */ for (i = 0; i < array->map.max_entries; i++) - BUG_ON(array->prog[i] != NULL); + BUG_ON(array->ptrs[i] != NULL); kvfree(array); } -static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key) +static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) { return NULL; } /* only called from syscall */ -static int prog_array_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +static int fd_array_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); - struct bpf_prog *prog, *old_prog; + void *new_ptr, *old_ptr; u32 index = *(u32 *)key, ufd; if (map_flags != BPF_ANY) @@ -191,57 +191,75 @@ static int prog_array_map_update_elem(struct bpf_map *map, void *key, return -E2BIG; ufd = *(u32 *)value; - prog = bpf_prog_get(ufd); - if (IS_ERR(prog)) - return PTR_ERR(prog); - - if (!bpf_prog_array_compatible(array, prog)) { - bpf_prog_put(prog); - return -EINVAL; - } + new_ptr = map->ops->map_fd_get_ptr(map, ufd); + if (IS_ERR(new_ptr)) + return PTR_ERR(new_ptr); - old_prog = xchg(array->prog + index, prog); - if (old_prog) - bpf_prog_put_rcu(old_prog); + old_ptr = xchg(array->ptrs + index, new_ptr); + if (old_ptr) + map->ops->map_fd_put_ptr(old_ptr); return 0; } -static int prog_array_map_delete_elem(struct bpf_map *map, void *key) +static int fd_array_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_array *array = container_of(map, struct bpf_array, map); - struct bpf_prog *old_prog; + void *old_ptr; u32 index = *(u32 *)key; if (index >= array->map.max_entries) return -E2BIG; - old_prog = xchg(array->prog + index, NULL); - if (old_prog) { - bpf_prog_put_rcu(old_prog); + old_ptr = xchg(array->ptrs + index, NULL); + if (old_ptr) { + map->ops->map_fd_put_ptr(old_ptr); return 0; } else { return -ENOENT; } } +static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_prog *prog = bpf_prog_get(fd); + if (IS_ERR(prog)) + return prog; + + if (!bpf_prog_array_compatible(array, prog)) { + bpf_prog_put(prog); + return ERR_PTR(-EINVAL); + } + return prog; +} + +static void prog_fd_array_put_ptr(void *ptr) +{ + struct bpf_prog *prog = ptr; + + bpf_prog_put_rcu(prog); +} + /* decrement refcnt of all bpf_progs that are stored in this map */ -void bpf_prog_array_map_clear(struct bpf_map *map) +void bpf_fd_array_map_clear(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; for (i = 0; i < array->map.max_entries; i++) - prog_array_map_delete_elem(map, &i); + fd_array_map_delete_elem(map, &i); } static const struct bpf_map_ops prog_array_ops = { - .map_alloc = prog_array_map_alloc, - .map_free = prog_array_map_free, + .map_alloc = fd_array_map_alloc, + .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, - .map_lookup_elem = prog_array_map_lookup_elem, - .map_update_elem = prog_array_map_update_elem, - .map_delete_elem = prog_array_map_delete_elem, + .map_lookup_elem = fd_array_map_lookup_elem, + .map_update_elem = fd_array_map_update_elem, + .map_delete_elem = fd_array_map_delete_elem, + .map_fd_get_ptr = prog_fd_array_get_ptr, + .map_fd_put_ptr = prog_fd_array_put_ptr, }; static struct bpf_map_type_list prog_array_type __read_mostly = { @@ -255,3 +273,60 @@ static int __init register_prog_array_map(void) return 0; } late_initcall(register_prog_array_map); + +static void perf_event_array_map_free(struct bpf_map *map) +{ + bpf_fd_array_map_clear(map); + fd_array_map_free(map); +} + +static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) +{ + struct perf_event *event; + const struct perf_event_attr *attr; + + event = perf_event_get(fd); + if (IS_ERR(event)) + return event; + + attr = perf_event_attrs(event); + if (IS_ERR(attr)) + return (void *)attr; + + if (attr->type != PERF_TYPE_RAW && + attr->type != PERF_TYPE_HARDWARE) { + perf_event_release_kernel(event); + return ERR_PTR(-EINVAL); + } + return event; +} + +static void perf_event_fd_array_put_ptr(void *ptr) +{ + struct perf_event *event = ptr; + + perf_event_release_kernel(event); +} + +static const struct bpf_map_ops perf_event_array_ops = { + .map_alloc = fd_array_map_alloc, + .map_free = perf_event_array_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = fd_array_map_lookup_elem, + .map_update_elem = fd_array_map_update_elem, + .map_delete_elem = fd_array_map_delete_elem, + .map_fd_get_ptr = perf_event_fd_array_get_ptr, + .map_fd_put_ptr = perf_event_fd_array_put_ptr, +}; + +static struct bpf_map_type_list perf_event_array_type __read_mostly = { + .ops = &perf_event_array_ops, + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, +}; + +static int __init register_perf_event_array_map(void) +{ + bpf_register_map_type(&perf_event_array_type); + return 0; +} +late_initcall(register_perf_event_array_map); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c5bedc82bc1c..67c380cfa9ca 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -177,6 +177,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { return 0; } +EXPORT_SYMBOL_GPL(__bpf_call_base); /** * __bpf_prog_run - run eBPF program on a given context @@ -449,11 +450,15 @@ select_insn: tail_call_cnt++; - prog = READ_ONCE(array->prog[index]); + prog = READ_ONCE(array->ptrs[index]); if (unlikely(!prog)) goto out; - ARG1 = BPF_R1; + /* ARG1 at this point is guaranteed to point to CTX from + * the verifier side due to the fact that the tail call is + * handeled like a helper, that is, bpf_tail_call_proto, + * where arg1_type is ARG_PTR_TO_CTX. + */ insn = prog->insnsi; goto select_insn; out: diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a1b14d197a4f..35bac8e8b071 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -72,7 +72,7 @@ static int bpf_map_release(struct inode *inode, struct file *filp) /* prog_array stores refcnt-ed bpf_prog pointers * release them all when user space closes prog_array_fd */ - bpf_prog_array_map_clear(map); + bpf_fd_array_map_clear(map); bpf_map_put(map); return 0; @@ -155,14 +155,15 @@ static int map_lookup_elem(union bpf_attr *attr) void __user *ukey = u64_to_ptr(attr->key); void __user *uvalue = u64_to_ptr(attr->value); int ufd = attr->map_fd; - struct fd f = fdget(ufd); struct bpf_map *map; void *key, *value, *ptr; + struct fd f; int err; if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) return -EINVAL; + f = fdget(ufd); map = bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -213,14 +214,15 @@ static int map_update_elem(union bpf_attr *attr) void __user *ukey = u64_to_ptr(attr->key); void __user *uvalue = u64_to_ptr(attr->value); int ufd = attr->map_fd; - struct fd f = fdget(ufd); struct bpf_map *map; void *key, *value; + struct fd f; int err; if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) return -EINVAL; + f = fdget(ufd); map = bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -265,14 +267,15 @@ static int map_delete_elem(union bpf_attr *attr) { void __user *ukey = u64_to_ptr(attr->key); int ufd = attr->map_fd; - struct fd f = fdget(ufd); struct bpf_map *map; + struct fd f; void *key; int err; if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) return -EINVAL; + f = fdget(ufd); map = bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -305,14 +308,15 @@ static int map_get_next_key(union bpf_attr *attr) void __user *ukey = u64_to_ptr(attr->key); void __user *unext_key = u64_to_ptr(attr->next_key); int ufd = attr->map_fd; - struct fd f = fdget(ufd); struct bpf_map *map; void *key, *next_key; + struct fd f; int err; if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) return -EINVAL; + f = fdget(ufd); map = bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 039d866fd36a..b074b23000d6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -238,6 +238,14 @@ static const char * const reg_type_str[] = { [CONST_IMM] = "imm", }; +static const struct { + int map_type; + int func_id; +} func_limit[] = { + {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, + {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, +}; + static void print_verifier_state(struct verifier_env *env) { enum bpf_reg_type t; @@ -275,7 +283,7 @@ static const char *const bpf_class_string[] = { [BPF_ALU64] = "alu64", }; -static const char *const bpf_alu_string[] = { +static const char *const bpf_alu_string[16] = { [BPF_ADD >> 4] = "+=", [BPF_SUB >> 4] = "-=", [BPF_MUL >> 4] = "*=", @@ -299,7 +307,7 @@ static const char *const bpf_ldst_string[] = { [BPF_DW >> 3] = "u64", }; -static const char *const bpf_jmp_string[] = { +static const char *const bpf_jmp_string[16] = { [BPF_JA >> 4] = "jmp", [BPF_JEQ >> 4] = "==", [BPF_JGT >> 4] = ">", @@ -648,6 +656,9 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, struct verifier_state *state = &env->cur_state; int size, err = 0; + if (state->regs[regno].type == PTR_TO_STACK) + off += state->regs[regno].imm; + size = bpf_size_to_bytes(bpf_size); if (size < 0) return size; @@ -667,7 +678,8 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown_value(state->regs, value_regno); - } else if (state->regs[regno].type == FRAME_PTR) { + } else if (state->regs[regno].type == FRAME_PTR || + state->regs[regno].type == PTR_TO_STACK) { if (off >= 0 || off < -MAX_BPF_STACK) { verbose("invalid stack off=%d size=%d\n", off, size); return -EACCES; @@ -833,6 +845,28 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return err; } +static int check_map_func_compatibility(struct bpf_map *map, int func_id) +{ + bool bool_map, bool_func; + int i; + + if (!map) + return 0; + + for (i = 0; i < ARRAY_SIZE(func_limit); i++) { + bool_map = (map->map_type == func_limit[i].map_type); + bool_func = (func_id == func_limit[i].func_id); + /* only when map & func pair match it can continue. + * don't allow any other map type to be passed into + * the special func; + */ + if (bool_map != bool_func) + return -EINVAL; + } + + return 0; +} + static int check_call(struct verifier_env *env, int func_id) { struct verifier_state *state = &env->cur_state; @@ -908,21 +942,9 @@ static int check_call(struct verifier_env *env, int func_id) return -EINVAL; } - if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY && - func_id != BPF_FUNC_tail_call) - /* prog_array map type needs extra care: - * only allow to pass it into bpf_tail_call() for now. - * bpf_map_delete_elem() can be allowed in the future, - * while bpf_map_update_elem() must only be done via syscall - */ - return -EINVAL; - - if (func_id == BPF_FUNC_tail_call && - map->map_type != BPF_MAP_TYPE_PROG_ARRAY) - /* don't allow any other map type to be passed into - * bpf_tail_call() - */ - return -EINVAL; + err = check_map_func_compatibility(map, func_id); + if (err) + return err; return 0; } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b89f3168411b..2cf0f79f1fc9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -145,6 +145,7 @@ static const char *cgroup_subsys_name[] = { * part of that cgroup. */ struct cgroup_root cgrp_dfl_root; +EXPORT_SYMBOL_GPL(cgrp_dfl_root); /* * The default hierarchy always exists but is hidden until mounted for the @@ -186,6 +187,9 @@ static u64 css_serial_nr_next = 1; static unsigned long have_fork_callback __read_mostly; static unsigned long have_exit_callback __read_mostly; +/* Ditto for the can_fork callback. */ +static unsigned long have_canfork_callback __read_mostly; + static struct cftype cgroup_dfl_base_files[]; static struct cftype cgroup_legacy_base_files[]; @@ -207,7 +211,7 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, idr_preload(gfp_mask); spin_lock_bh(&cgroup_idr_lock); - ret = idr_alloc(idr, ptr, start, end, gfp_mask); + ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT); spin_unlock_bh(&cgroup_idr_lock); idr_preload_end(); return ret; @@ -1027,10 +1031,13 @@ static const struct file_operations proc_cgroupstats_operations; static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, char *buf) { + struct cgroup_subsys *ss = cft->ss; + if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s", - cft->ss->name, cft->name); + cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, + cft->name); else strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); return buf; @@ -1332,9 +1339,10 @@ static int cgroup_show_options(struct seq_file *seq, struct cgroup_subsys *ss; int ssid; - for_each_subsys(ss, ssid) - if (root->subsys_mask & (1 << ssid)) - seq_printf(seq, ",%s", ss->name); + if (root != &cgrp_dfl_root) + for_each_subsys(ss, ssid) + if (root->subsys_mask & (1 << ssid)) + seq_show_option(seq, ss->legacy_name, NULL); if (root->flags & CGRP_ROOT_NOPREFIX) seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) @@ -1342,13 +1350,14 @@ static int cgroup_show_options(struct seq_file *seq, spin_lock(&release_agent_path_lock); if (strlen(root->release_agent_path)) - seq_printf(seq, ",release_agent=%s", root->release_agent_path); + seq_show_option(seq, "release_agent", + root->release_agent_path); spin_unlock(&release_agent_path_lock); if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) seq_puts(seq, ",clone_children"); if (strlen(root->name)) - seq_printf(seq, ",name=%s", root->name); + seq_show_option(seq, "name", root->name); return 0; } @@ -1447,7 +1456,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) } for_each_subsys(ss, i) { - if (strcmp(token, ss->name)) + if (strcmp(token, ss->legacy_name)) continue; if (ss->disabled) continue; @@ -1666,7 +1675,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) lockdep_assert_held(&cgroup_mutex); - ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT); + ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL); if (ret < 0) goto out; root_cgrp->id = ret; @@ -4579,7 +4588,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, if (err) goto err_free_css; - err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT); + err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL); if (err < 0) goto err_free_percpu_ref; css->id = err; @@ -4656,7 +4665,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, * Temporarily set the pointer to NULL, so idr_find() won't return * a half-baked cgroup. */ - cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT); + cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); if (cgrp->id < 0) { ret = -ENOMEM; goto out_cancel_ref; @@ -4955,6 +4964,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) have_fork_callback |= (bool)ss->fork << ss->id; have_exit_callback |= (bool)ss->exit << ss->id; + have_canfork_callback |= (bool)ss->can_fork << ss->id; /* At system boot, before all subsystems have been * registered, no tasks have been forked, so we don't @@ -4993,6 +5003,8 @@ int __init cgroup_init_early(void) ss->id = i; ss->name = cgroup_subsys_name[i]; + if (!ss->legacy_name) + ss->legacy_name = cgroup_subsys_name[i]; if (ss->early_init) cgroup_init_subsys(ss, true); @@ -5136,9 +5148,11 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, continue; seq_printf(m, "%d:", root->hierarchy_id); - for_each_subsys(ss, ssid) - if (root->subsys_mask & (1 << ssid)) - seq_printf(m, "%s%s", count++ ? "," : "", ss->name); + if (root != &cgrp_dfl_root) + for_each_subsys(ss, ssid) + if (root->subsys_mask & (1 << ssid)) + seq_printf(m, "%s%s", count++ ? "," : "", + ss->legacy_name); if (strlen(root->name)) seq_printf(m, "%sname=%s", count ? "," : "", root->name); @@ -5178,7 +5192,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) for_each_subsys(ss, i) seq_printf(m, "%s\t%d\t%d\t%d\n", - ss->name, ss->root->hierarchy_id, + ss->legacy_name, ss->root->hierarchy_id, atomic_read(&ss->root->nr_cgrps), !ss->disabled); mutex_unlock(&cgroup_mutex); @@ -5197,6 +5211,19 @@ static const struct file_operations proc_cgroupstats_operations = { .release = single_release, }; +static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i) +{ + if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END) + return &ss_priv[i - CGROUP_CANFORK_START]; + return NULL; +} + +static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i) +{ + void **private = subsys_canfork_priv_p(ss_priv, i); + return private ? *private : NULL; +} + /** * cgroup_fork - initialize cgroup related fields during copy_process() * @child: pointer to task_struct of forking parent process. @@ -5212,6 +5239,57 @@ void cgroup_fork(struct task_struct *child) } /** + * cgroup_can_fork - called on a new task before the process is exposed + * @child: the task in question. + * + * This calls the subsystem can_fork() callbacks. If the can_fork() callback + * returns an error, the fork aborts with that error code. This allows for + * a cgroup subsystem to conditionally allow or deny new forks. + */ +int cgroup_can_fork(struct task_struct *child, + void *ss_priv[CGROUP_CANFORK_COUNT]) +{ + struct cgroup_subsys *ss; + int i, j, ret; + + for_each_subsys_which(ss, i, &have_canfork_callback) { + ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i)); + if (ret) + goto out_revert; + } + + return 0; + +out_revert: + for_each_subsys(ss, j) { + if (j >= i) + break; + if (ss->cancel_fork) + ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j)); + } + + return ret; +} + +/** + * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork() + * @child: the task in question + * + * This calls the cancel_fork() callbacks if a fork failed *after* + * cgroup_can_fork() succeded. + */ +void cgroup_cancel_fork(struct task_struct *child, + void *ss_priv[CGROUP_CANFORK_COUNT]) +{ + struct cgroup_subsys *ss; + int i; + + for_each_subsys(ss, i) + if (ss->cancel_fork) + ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i)); +} + +/** * cgroup_post_fork - called on a new task after adding it to the task list * @child: the task in question * @@ -5221,7 +5299,8 @@ void cgroup_fork(struct task_struct *child) * cgroup_task_iter_start() - to guarantee that the new task ends up on its * list. */ -void cgroup_post_fork(struct task_struct *child) +void cgroup_post_fork(struct task_struct *child, + void *old_ss_priv[CGROUP_CANFORK_COUNT]) { struct cgroup_subsys *ss; int i; @@ -5266,7 +5345,7 @@ void cgroup_post_fork(struct task_struct *child) * and addition to css_set. */ for_each_subsys_which(ss, i, &have_fork_callback) - ss->fork(child); + ss->fork(child, subsys_canfork_priv(old_ss_priv, i)); } /** @@ -5400,12 +5479,14 @@ static int __init cgroup_disable(char *str) continue; for_each_subsys(ss, i) { - if (!strcmp(token, ss->name)) { - ss->disabled = 1; - printk(KERN_INFO "Disabling %s control group" - " subsystem\n", ss->name); - break; - } + if (strcmp(token, ss->name) && + strcmp(token, ss->legacy_name)) + continue; + + ss->disabled = 1; + printk(KERN_INFO "Disabling %s control group subsystem\n", + ss->name); + break; } } return 1; diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 92b98cc0ee76..f1b30ad5dc6d 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -203,7 +203,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, * to do anything as freezer_attach() will put @task into the appropriate * state. */ -static void freezer_fork(struct task_struct *task) +static void freezer_fork(struct task_struct *task, void *private) { struct freezer *freezer; diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c new file mode 100644 index 000000000000..806cd7693ac8 --- /dev/null +++ b/kernel/cgroup_pids.c @@ -0,0 +1,355 @@ +/* + * Process number limiting controller for cgroups. + * + * Used to allow a cgroup hierarchy to stop any new processes from fork()ing + * after a certain limit is reached. + * + * Since it is trivial to hit the task limit without hitting any kmemcg limits + * in place, PIDs are a fundamental resource. As such, PID exhaustion must be + * preventable in the scope of a cgroup hierarchy by allowing resource limiting + * of the number of tasks in a cgroup. + * + * In order to use the `pids` controller, set the maximum number of tasks in + * pids.max (this is not available in the root cgroup for obvious reasons). The + * number of processes currently in the cgroup is given by pids.current. + * Organisational operations are not blocked by cgroup policies, so it is + * possible to have pids.current > pids.max. However, it is not possible to + * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking + * would cause a cgroup policy to be violated. + * + * To set a cgroup to have no limit, set pids.max to "max". This is the default + * for all new cgroups (N.B. that PID limits are hierarchical, so the most + * stringent limit in the hierarchy is followed). + * + * pids.current tracks all child cgroup hierarchies, so parent/pids.current is + * a superset of parent/child/pids.current. + * + * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com> + * + * This file is subject to the terms and conditions of version 2 of the GNU + * General Public License. See the file COPYING in the main directory of the + * Linux distribution for more details. + */ + +#include <linux/kernel.h> +#include <linux/threads.h> +#include <linux/atomic.h> +#include <linux/cgroup.h> +#include <linux/slab.h> + +#define PIDS_MAX (PID_MAX_LIMIT + 1ULL) +#define PIDS_MAX_STR "max" + +struct pids_cgroup { + struct cgroup_subsys_state css; + + /* + * Use 64-bit types so that we can safely represent "max" as + * %PIDS_MAX = (%PID_MAX_LIMIT + 1). + */ + atomic64_t counter; + int64_t limit; +}; + +static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css) +{ + return container_of(css, struct pids_cgroup, css); +} + +static struct pids_cgroup *parent_pids(struct pids_cgroup *pids) +{ + return css_pids(pids->css.parent); +} + +static struct cgroup_subsys_state * +pids_css_alloc(struct cgroup_subsys_state *parent) +{ + struct pids_cgroup *pids; + + pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL); + if (!pids) + return ERR_PTR(-ENOMEM); + + pids->limit = PIDS_MAX; + atomic64_set(&pids->counter, 0); + return &pids->css; +} + +static void pids_css_free(struct cgroup_subsys_state *css) +{ + kfree(css_pids(css)); +} + +/** + * pids_cancel - uncharge the local pid count + * @pids: the pid cgroup state + * @num: the number of pids to cancel + * + * This function will WARN if the pid count goes under 0, because such a case is + * a bug in the pids controller proper. + */ +static void pids_cancel(struct pids_cgroup *pids, int num) +{ + /* + * A negative count (or overflow for that matter) is invalid, + * and indicates a bug in the `pids` controller proper. + */ + WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter)); +} + +/** + * pids_uncharge - hierarchically uncharge the pid count + * @pids: the pid cgroup state + * @num: the number of pids to uncharge + */ +static void pids_uncharge(struct pids_cgroup *pids, int num) +{ + struct pids_cgroup *p; + + for (p = pids; p; p = parent_pids(p)) + pids_cancel(p, num); +} + +/** + * pids_charge - hierarchically charge the pid count + * @pids: the pid cgroup state + * @num: the number of pids to charge + * + * This function does *not* follow the pid limit set. It cannot fail and the new + * pid count may exceed the limit. This is only used for reverting failed + * attaches, where there is no other way out than violating the limit. + */ +static void pids_charge(struct pids_cgroup *pids, int num) +{ + struct pids_cgroup *p; + + for (p = pids; p; p = parent_pids(p)) + atomic64_add(num, &p->counter); +} + +/** + * pids_try_charge - hierarchically try to charge the pid count + * @pids: the pid cgroup state + * @num: the number of pids to charge + * + * This function follows the set limit. It will fail if the charge would cause + * the new value to exceed the hierarchical limit. Returns 0 if the charge + * succeded, otherwise -EAGAIN. + */ +static int pids_try_charge(struct pids_cgroup *pids, int num) +{ + struct pids_cgroup *p, *q; + + for (p = pids; p; p = parent_pids(p)) { + int64_t new = atomic64_add_return(num, &p->counter); + + /* + * Since new is capped to the maximum number of pid_t, if + * p->limit is %PIDS_MAX then we know that this test will never + * fail. + */ + if (new > p->limit) + goto revert; + } + + return 0; + +revert: + for (q = pids; q != p; q = parent_pids(q)) + pids_cancel(q, num); + pids_cancel(p, num); + + return -EAGAIN; +} + +static int pids_can_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) +{ + struct pids_cgroup *pids = css_pids(css); + struct task_struct *task; + + cgroup_taskset_for_each(task, tset) { + struct cgroup_subsys_state *old_css; + struct pids_cgroup *old_pids; + + /* + * No need to pin @old_css between here and cancel_attach() + * because cgroup core protects it from being freed before + * the migration completes or fails. + */ + old_css = task_css(task, pids_cgrp_id); + old_pids = css_pids(old_css); + + pids_charge(pids, 1); + pids_uncharge(old_pids, 1); + } + + return 0; +} + +static void pids_cancel_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) +{ + struct pids_cgroup *pids = css_pids(css); + struct task_struct *task; + + cgroup_taskset_for_each(task, tset) { + struct cgroup_subsys_state *old_css; + struct pids_cgroup *old_pids; + + old_css = task_css(task, pids_cgrp_id); + old_pids = css_pids(old_css); + + pids_charge(old_pids, 1); + pids_uncharge(pids, 1); + } +} + +static int pids_can_fork(struct task_struct *task, void **priv_p) +{ + struct cgroup_subsys_state *css; + struct pids_cgroup *pids; + int err; + + /* + * Use the "current" task_css for the pids subsystem as the tentative + * css. It is possible we will charge the wrong hierarchy, in which + * case we will forcefully revert/reapply the charge on the right + * hierarchy after it is committed to the task proper. + */ + css = task_get_css(current, pids_cgrp_id); + pids = css_pids(css); + + err = pids_try_charge(pids, 1); + if (err) + goto err_css_put; + + *priv_p = css; + return 0; + +err_css_put: + css_put(css); + return err; +} + +static void pids_cancel_fork(struct task_struct *task, void *priv) +{ + struct cgroup_subsys_state *css = priv; + struct pids_cgroup *pids = css_pids(css); + + pids_uncharge(pids, 1); + css_put(css); +} + +static void pids_fork(struct task_struct *task, void *priv) +{ + struct cgroup_subsys_state *css; + struct cgroup_subsys_state *old_css = priv; + struct pids_cgroup *pids; + struct pids_cgroup *old_pids = css_pids(old_css); + + css = task_get_css(task, pids_cgrp_id); + pids = css_pids(css); + + /* + * If the association has changed, we have to revert and reapply the + * charge/uncharge on the wrong hierarchy to the current one. Since + * the association can only change due to an organisation event, its + * okay for us to ignore the limit in this case. + */ + if (pids != old_pids) { + pids_uncharge(old_pids, 1); + pids_charge(pids, 1); + } + + css_put(css); + css_put(old_css); +} + +static void pids_exit(struct cgroup_subsys_state *css, + struct cgroup_subsys_state *old_css, + struct task_struct *task) +{ + struct pids_cgroup *pids = css_pids(old_css); + + pids_uncharge(pids, 1); +} + +static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct cgroup_subsys_state *css = of_css(of); + struct pids_cgroup *pids = css_pids(css); + int64_t limit; + int err; + + buf = strstrip(buf); + if (!strcmp(buf, PIDS_MAX_STR)) { + limit = PIDS_MAX; + goto set_limit; + } + + err = kstrtoll(buf, 0, &limit); + if (err) + return err; + + if (limit < 0 || limit >= PIDS_MAX) + return -EINVAL; + +set_limit: + /* + * Limit updates don't need to be mutex'd, since it isn't + * critical that any racing fork()s follow the new limit. + */ + pids->limit = limit; + return nbytes; +} + +static int pids_max_show(struct seq_file *sf, void *v) +{ + struct cgroup_subsys_state *css = seq_css(sf); + struct pids_cgroup *pids = css_pids(css); + int64_t limit = pids->limit; + + if (limit >= PIDS_MAX) + seq_printf(sf, "%s\n", PIDS_MAX_STR); + else + seq_printf(sf, "%lld\n", limit); + + return 0; +} + +static s64 pids_current_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct pids_cgroup *pids = css_pids(css); + + return atomic64_read(&pids->counter); +} + +static struct cftype pids_files[] = { + { + .name = "max", + .write = pids_max_write, + .seq_show = pids_max_show, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "current", + .read_s64 = pids_current_read, + }, + { } /* terminate */ +}; + +struct cgroup_subsys pids_cgrp_subsys = { + .css_alloc = pids_css_alloc, + .css_free = pids_css_free, + .can_attach = pids_can_attach, + .cancel_attach = pids_cancel_attach, + .can_fork = pids_can_fork, + .cancel_fork = pids_cancel_fork, + .fork = pids_fork, + .exit = pids_exit, + .legacy_cftypes = pids_files, + .dfl_cftypes = pids_files, +}; diff --git a/kernel/cred.c b/kernel/cred.c index ec1c07667ec1..71179a09c1d6 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -20,11 +20,16 @@ #include <linux/cn_proc.h> #if 0 -#define kdebug(FMT, ...) \ - printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) +#define kdebug(FMT, ...) \ + printk("[%-5.5s%5u] " FMT "\n", \ + current->comm, current->pid, ##__VA_ARGS__) #else -#define kdebug(FMT, ...) \ - no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) +#define kdebug(FMT, ...) \ +do { \ + if (0) \ + no_printk("[%-5.5s%5u] " FMT "\n", \ + current->comm, current->pid, ##__VA_ARGS__); \ +} while (0) #endif static struct kmem_cache *cred_jar; diff --git a/kernel/events/core.c b/kernel/events/core.c index ae16867670a9..f548f69c4299 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3222,6 +3222,59 @@ static inline u64 perf_event_count(struct perf_event *event) return __perf_event_count(event); } +/* + * NMI-safe method to read a local event, that is an event that + * is: + * - either for the current task, or for this CPU + * - does not have inherit set, for inherited task events + * will not be local and we cannot read them atomically + * - must not have a pmu::count method + */ +u64 perf_event_read_local(struct perf_event *event) +{ + unsigned long flags; + u64 val; + + /* + * Disabling interrupts avoids all counter scheduling (context + * switches, timer based rotation and IPIs). + */ + local_irq_save(flags); + + /* If this is a per-task event, it must be for current */ + WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) && + event->hw.target != current); + + /* If this is a per-CPU event, it must be for this CPU */ + WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) && + event->cpu != smp_processor_id()); + + /* + * It must not be an event with inherit set, we cannot read + * all child counters from atomic context. + */ + WARN_ON_ONCE(event->attr.inherit); + + /* + * It must not have a pmu::count method, those are not + * NMI safe. + */ + WARN_ON_ONCE(event->pmu->count); + + /* + * If the event is currently on this CPU, its either a per-task event, + * or local to this CPU. Furthermore it means its ACTIVE (otherwise + * oncpu == -1). + */ + if (event->oncpu == smp_processor_id()) + event->pmu->read(event); + + val = local64_read(&event->count); + local_irq_restore(flags); + + return val; +} + static u64 perf_event_read(struct perf_event *event) { /* @@ -8718,6 +8771,31 @@ void perf_event_delayed_put(struct task_struct *task) WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); } +struct perf_event *perf_event_get(unsigned int fd) +{ + int err; + struct fd f; + struct perf_event *event; + + err = perf_fget_light(fd, &f); + if (err) + return ERR_PTR(err); + + event = f.file->private_data; + atomic_long_inc(&event->refcount); + fdput(f); + + return event; +} + +const struct perf_event_attr *perf_event_attrs(struct perf_event *event) +{ + if (!event) + return ERR_PTR(-EINVAL); + + return &event->attr; +} + /* * inherit a event from parent task to child task: */ @@ -9016,7 +9094,7 @@ static void perf_event_init_cpu(int cpu) mutex_unlock(&swhash->hlist_mutex); } -#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC +#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE static void __perf_event_exit_context(void *__info) { struct remove_event re = { .detach_group = true }; diff --git a/kernel/extable.c b/kernel/extable.c index c98f926277a8..e820ccee9846 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -18,7 +18,6 @@ #include <linux/ftrace.h> #include <linux/memory.h> #include <linux/module.h> -#include <linux/ftrace.h> #include <linux/mutex.h> #include <linux/init.h> diff --git a/kernel/fork.c b/kernel/fork.c index 2b1a61cddc19..7d5f0f118a63 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -454,8 +454,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) tmp->vm_mm = mm; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= ~VM_LOCKED; + tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP); tmp->vm_next = tmp->vm_prev = NULL; + tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; file = tmp->vm_file; if (file) { struct inode *inode = file_inode(file); @@ -1246,6 +1247,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, { int retval; struct task_struct *p; + void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {}; if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -1518,6 +1520,16 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->task_works = NULL; /* + * Ensure that the cgroup subsystem policies allow the new process to be + * forked. It should be noted the the new process's css_set can be changed + * between here and cgroup_post_fork() if an organisation operation is in + * progress. + */ + retval = cgroup_can_fork(p, cgrp_ss_priv); + if (retval) + goto bad_fork_free_pid; + + /* * Make it visible to the rest of the system, but dont wake it up yet. * Need tasklist lock for parent etc handling! */ @@ -1553,7 +1565,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; - goto bad_fork_free_pid; + goto bad_fork_cancel_cgroup; } if (likely(p->pid)) { @@ -1595,7 +1607,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); proc_fork_connector(p); - cgroup_post_fork(p); + cgroup_post_fork(p, cgrp_ss_priv); if (clone_flags & CLONE_THREAD) threadgroup_change_end(current); perf_event_fork(p); @@ -1605,6 +1617,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, return p; +bad_fork_cancel_cgroup: + cgroup_cancel_fork(p, cgrp_ss_priv); bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); diff --git a/kernel/futex.c b/kernel/futex.c index c4a182f5357e..6e443efc65f4 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -64,6 +64,7 @@ #include <linux/hugetlb.h> #include <linux/freezer.h> #include <linux/bootmem.h> +#include <linux/fault-inject.h> #include <asm/futex.h> @@ -258,6 +259,66 @@ static unsigned long __read_mostly futex_hashsize; static struct futex_hash_bucket *futex_queues; +/* + * Fault injections for futexes. + */ +#ifdef CONFIG_FAIL_FUTEX + +static struct { + struct fault_attr attr; + + u32 ignore_private; +} fail_futex = { + .attr = FAULT_ATTR_INITIALIZER, + .ignore_private = 0, +}; + +static int __init setup_fail_futex(char *str) +{ + return setup_fault_attr(&fail_futex.attr, str); +} +__setup("fail_futex=", setup_fail_futex); + +static bool should_fail_futex(bool fshared) +{ + if (fail_futex.ignore_private && !fshared) + return false; + + return should_fail(&fail_futex.attr, 1); +} + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + +static int __init fail_futex_debugfs(void) +{ + umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_futex", NULL, + &fail_futex.attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + if (!debugfs_create_bool("ignore-private", mode, dir, + &fail_futex.ignore_private)) { + debugfs_remove_recursive(dir); + return -ENOMEM; + } + + return 0; +} + +late_initcall(fail_futex_debugfs); + +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ + +#else +static inline bool should_fail_futex(bool fshared) +{ + return false; +} +#endif /* CONFIG_FAIL_FUTEX */ + static inline void futex_get_mm(union futex_key *key) { atomic_inc(&key->private.mm->mm_count); @@ -413,6 +474,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) return -EFAULT; + if (unlikely(should_fail_futex(fshared))) + return -EFAULT; + /* * PROCESS_PRIVATE futexes are fast. * As the mm cannot disappear under us and the 'key' only needs @@ -428,6 +492,10 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) } again: + /* Ignore any VERIFY_READ mapping (futex common case) */ + if (unlikely(should_fail_futex(fshared))) + return -EFAULT; + err = get_user_pages_fast(address, 1, 1, &page); /* * If write access is not required (eg. FUTEX_WAIT), try @@ -516,7 +584,7 @@ again: * A RO anonymous page will never change and thus doesn't make * sense for futex operations. */ - if (ro) { + if (unlikely(should_fail_futex(fshared)) || ro) { err = -EFAULT; goto out; } @@ -974,6 +1042,9 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) { u32 uninitialized_var(curval); + if (unlikely(should_fail_futex(true))) + return -EFAULT; + if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) return -EFAULT; @@ -1015,12 +1086,18 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, if (get_futex_value_locked(&uval, uaddr)) return -EFAULT; + if (unlikely(should_fail_futex(true))) + return -EFAULT; + /* * Detect deadlocks. */ if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) return -EDEADLK; + if ((unlikely(should_fail_futex(true)))) + return -EDEADLK; + /* * Lookup existing state first. If it exists, try to attach to * its pi_state. @@ -1155,6 +1232,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, */ newval = FUTEX_WAITERS | task_pid_vnr(new_owner); + if (unlikely(should_fail_futex(true))) + ret = -EFAULT; + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) ret = -EFAULT; else if (curval != uval) @@ -1457,6 +1537,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, if (get_futex_value_locked(&curval, pifutex)) return -EFAULT; + if (unlikely(should_fail_futex(true))) + return -EFAULT; + /* * Find the top_waiter and determine if there are additional waiters. * If the caller intends to requeue more than 1 waiter to pifutex, @@ -2268,8 +2351,11 @@ static long futex_wait_restart(struct restart_block *restart) /* * Userspace tried a 0 -> TID atomic transition of the futex value * and failed. The kernel side here does the whole locking operation: - * if there are waiters then it will block, it does PI, etc. (Due to - * races the kernel might see a 0 value of the futex too.) + * if there are waiters then it will block as a consequence of relying + * on rt-mutexes, it does PI, etc. (Due to races the kernel might see + * a 0 value of the futex too.). + * + * Also serves as futex trylock_pi()'ing, and due semantics. */ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) @@ -2300,6 +2386,10 @@ retry_private: ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); if (unlikely(ret)) { + /* + * Atomic work succeeded and we got the lock, + * or failed. Either way, we do _not_ block. + */ switch (ret) { case 1: /* We got the lock. */ @@ -2530,7 +2620,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 * @uaddr: the futex we initially wait on (non-pi) * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be - * the same type, no requeueing from private to shared, etc. + * the same type, no requeueing from private to shared, etc. * @val: the expected value of uaddr * @abs_time: absolute timeout * @bitset: 32 bit wakeup bitset set by userspace, defaults to all @@ -3005,6 +3095,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || cmd == FUTEX_WAIT_BITSET || cmd == FUTEX_WAIT_REQUEUE_PI)) { + if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) + return -EFAULT; if (copy_from_user(&ts, utime, sizeof(ts)) != 0) return -EFAULT; if (!timespec_valid(&ts)) diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 52ebaca1b9fc..f7dd15d537f9 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -54,7 +54,7 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); } -static void jump_label_update(struct static_key *key, int enable); +static void jump_label_update(struct static_key *key); void static_key_slow_inc(struct static_key *key) { @@ -63,13 +63,8 @@ void static_key_slow_inc(struct static_key *key) return; jump_label_lock(); - if (atomic_read(&key->enabled) == 0) { - if (!jump_label_get_branch_default(key)) - jump_label_update(key, JUMP_LABEL_ENABLE); - else - jump_label_update(key, JUMP_LABEL_DISABLE); - } - atomic_inc(&key->enabled); + if (atomic_inc_return(&key->enabled) == 1) + jump_label_update(key); jump_label_unlock(); } EXPORT_SYMBOL_GPL(static_key_slow_inc); @@ -87,10 +82,7 @@ static void __static_key_slow_dec(struct static_key *key, atomic_inc(&key->enabled); schedule_delayed_work(work, rate_limit); } else { - if (!jump_label_get_branch_default(key)) - jump_label_update(key, JUMP_LABEL_DISABLE); - else - jump_label_update(key, JUMP_LABEL_ENABLE); + jump_label_update(key); } jump_label_unlock(); } @@ -149,7 +141,7 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start, return 0; } -/* +/* * Update code which is definitely not currently executing. * Architectures which need heavyweight synchronization to modify * running code can override this to make the non-live update case @@ -158,37 +150,54 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start, void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type) { - arch_jump_label_transform(entry, type); + arch_jump_label_transform(entry, type); +} + +static inline struct jump_entry *static_key_entries(struct static_key *key) +{ + return (struct jump_entry *)((unsigned long)key->entries & ~JUMP_TYPE_MASK); +} + +static inline bool static_key_type(struct static_key *key) +{ + return (unsigned long)key->entries & JUMP_TYPE_MASK; +} + +static inline struct static_key *jump_entry_key(struct jump_entry *entry) +{ + return (struct static_key *)((unsigned long)entry->key & ~1UL); +} + +static bool jump_entry_branch(struct jump_entry *entry) +{ + return (unsigned long)entry->key & 1UL; +} + +static enum jump_label_type jump_label_type(struct jump_entry *entry) +{ + struct static_key *key = jump_entry_key(entry); + bool enabled = static_key_enabled(key); + bool branch = jump_entry_branch(entry); + + /* See the comment in linux/jump_label.h */ + return enabled ^ branch; } static void __jump_label_update(struct static_key *key, struct jump_entry *entry, - struct jump_entry *stop, int enable) + struct jump_entry *stop) { - for (; (entry < stop) && - (entry->key == (jump_label_t)(unsigned long)key); - entry++) { + for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { /* * entry->code set to 0 invalidates module init text sections * kernel_text_address() verifies we are not in core kernel * init code, see jump_label_invalidate_module_init(). */ if (entry->code && kernel_text_address(entry->code)) - arch_jump_label_transform(entry, enable); + arch_jump_label_transform(entry, jump_label_type(entry)); } } -static enum jump_label_type jump_label_type(struct static_key *key) -{ - bool true_branch = jump_label_get_branch_default(key); - bool state = static_key_enabled(key); - - if ((!true_branch && state) || (true_branch && !state)) - return JUMP_LABEL_ENABLE; - - return JUMP_LABEL_DISABLE; -} - void __init jump_label_init(void) { struct jump_entry *iter_start = __start___jump_table; @@ -202,8 +211,11 @@ void __init jump_label_init(void) for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; - iterk = (struct static_key *)(unsigned long)iter->key; - arch_jump_label_transform_static(iter, jump_label_type(iterk)); + /* rewrite NOPs */ + if (jump_label_type(iter) == JUMP_LABEL_NOP) + arch_jump_label_transform_static(iter, JUMP_LABEL_NOP); + + iterk = jump_entry_key(iter); if (iterk == key) continue; @@ -222,6 +234,16 @@ void __init jump_label_init(void) #ifdef CONFIG_MODULES +static enum jump_label_type jump_label_init_type(struct jump_entry *entry) +{ + struct static_key *key = jump_entry_key(entry); + bool type = static_key_type(key); + bool branch = jump_entry_branch(entry); + + /* See the comment in linux/jump_label.h */ + return type ^ branch; +} + struct static_key_mod { struct static_key_mod *next; struct jump_entry *entries; @@ -243,17 +265,15 @@ static int __jump_label_mod_text_reserved(void *start, void *end) start, end); } -static void __jump_label_mod_update(struct static_key *key, int enable) +static void __jump_label_mod_update(struct static_key *key) { - struct static_key_mod *mod = key->next; + struct static_key_mod *mod; - while (mod) { + for (mod = key->next; mod; mod = mod->next) { struct module *m = mod->mod; __jump_label_update(key, mod->entries, - m->jump_entries + m->num_jump_entries, - enable); - mod = mod->next; + m->jump_entries + m->num_jump_entries); } } @@ -276,7 +296,9 @@ void jump_label_apply_nops(struct module *mod) return; for (iter = iter_start; iter < iter_stop; iter++) { - arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); + /* Only write NOPs for arch_branch_static(). */ + if (jump_label_init_type(iter) == JUMP_LABEL_NOP) + arch_jump_label_transform_static(iter, JUMP_LABEL_NOP); } } @@ -297,7 +319,7 @@ static int jump_label_add_module(struct module *mod) for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; - iterk = (struct static_key *)(unsigned long)iter->key; + iterk = jump_entry_key(iter); if (iterk == key) continue; @@ -318,8 +340,9 @@ static int jump_label_add_module(struct module *mod) jlm->next = key->next; key->next = jlm; - if (jump_label_type(key) == JUMP_LABEL_ENABLE) - __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); + /* Only update if we've changed from our initial state */ + if (jump_label_type(iter) != jump_label_init_type(iter)) + __jump_label_update(key, iter, iter_stop); } return 0; @@ -334,10 +357,10 @@ static void jump_label_del_module(struct module *mod) struct static_key_mod *jlm, **prev; for (iter = iter_start; iter < iter_stop; iter++) { - if (iter->key == (jump_label_t)(unsigned long)key) + if (jump_entry_key(iter) == key) continue; - key = (struct static_key *)(unsigned long)iter->key; + key = jump_entry_key(iter); if (within_module(iter->key, mod)) continue; @@ -439,14 +462,14 @@ int jump_label_text_reserved(void *start, void *end) return ret; } -static void jump_label_update(struct static_key *key, int enable) +static void jump_label_update(struct static_key *key) { struct jump_entry *stop = __stop___jump_table; - struct jump_entry *entry = jump_label_get_entries(key); + struct jump_entry *entry = static_key_entries(key); #ifdef CONFIG_MODULES struct module *mod; - __jump_label_mod_update(key, enable); + __jump_label_mod_update(key); preempt_disable(); mod = __module_address((unsigned long)key); @@ -456,7 +479,44 @@ static void jump_label_update(struct static_key *key, int enable) #endif /* if there are no users, entry can be NULL */ if (entry) - __jump_label_update(key, entry, stop, enable); + __jump_label_update(key, entry, stop); } -#endif +#ifdef CONFIG_STATIC_KEYS_SELFTEST +static DEFINE_STATIC_KEY_TRUE(sk_true); +static DEFINE_STATIC_KEY_FALSE(sk_false); + +static __init int jump_label_test(void) +{ + int i; + + for (i = 0; i < 2; i++) { + WARN_ON(static_key_enabled(&sk_true.key) != true); + WARN_ON(static_key_enabled(&sk_false.key) != false); + + WARN_ON(!static_branch_likely(&sk_true)); + WARN_ON(!static_branch_unlikely(&sk_true)); + WARN_ON(static_branch_likely(&sk_false)); + WARN_ON(static_branch_unlikely(&sk_false)); + + static_branch_disable(&sk_true); + static_branch_enable(&sk_false); + + WARN_ON(static_key_enabled(&sk_true.key) == true); + WARN_ON(static_key_enabled(&sk_false.key) == false); + + WARN_ON(static_branch_likely(&sk_true)); + WARN_ON(static_branch_unlikely(&sk_true)); + WARN_ON(!static_branch_likely(&sk_false)); + WARN_ON(!static_branch_unlikely(&sk_false)); + + static_branch_enable(&sk_true); + static_branch_disable(&sk_false); + } + + return 0; +} +late_initcall(jump_label_test); +#endif /* STATIC_KEYS_SELFTEST */ + +#endif /* HAVE_JUMP_LABEL */ diff --git a/kernel/kexec.c b/kernel/kexec.c index a785c1015e25..4c5edc357923 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1,156 +1,22 @@ /* - * kexec.c - kexec system call + * kexec.c - kexec_load system call * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> * * This source code is licensed under the GNU General Public License, * Version 2. See the file COPYING for more details. */ -#define pr_fmt(fmt) "kexec: " fmt - #include <linux/capability.h> #include <linux/mm.h> #include <linux/file.h> -#include <linux/slab.h> -#include <linux/fs.h> #include <linux/kexec.h> #include <linux/mutex.h> #include <linux/list.h> -#include <linux/highmem.h> #include <linux/syscalls.h> -#include <linux/reboot.h> -#include <linux/ioport.h> -#include <linux/hardirq.h> -#include <linux/elf.h> -#include <linux/elfcore.h> -#include <linux/utsname.h> -#include <linux/numa.h> -#include <linux/suspend.h> -#include <linux/device.h> -#include <linux/freezer.h> -#include <linux/pm.h> -#include <linux/cpu.h> -#include <linux/console.h> #include <linux/vmalloc.h> -#include <linux/swap.h> -#include <linux/syscore_ops.h> -#include <linux/compiler.h> -#include <linux/hugetlb.h> - -#include <asm/page.h> -#include <asm/uaccess.h> -#include <asm/io.h> -#include <asm/sections.h> - -#include <crypto/hash.h> -#include <crypto/sha.h> - -/* Per cpu memory for storing cpu states in case of system crash. */ -note_buf_t __percpu *crash_notes; - -/* vmcoreinfo stuff */ -static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; -size_t vmcoreinfo_size; -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); - -/* Flag to indicate we are going to kexec a new kernel */ -bool kexec_in_progress = false; - -/* - * Declare these symbols weak so that if architecture provides a purgatory, - * these will be overridden. - */ -char __weak kexec_purgatory[0]; -size_t __weak kexec_purgatory_size = 0; - -#ifdef CONFIG_KEXEC_FILE -static int kexec_calculate_store_digests(struct kimage *image); -#endif - -/* Location of the reserved area for the crash kernel */ -struct resource crashk_res = { - .name = "Crash kernel", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; -struct resource crashk_low_res = { - .name = "Crash kernel", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -int kexec_should_crash(struct task_struct *p) -{ - /* - * If crash_kexec_post_notifiers is enabled, don't run - * crash_kexec() here yet, which must be run after panic - * notifiers in panic(). - */ - if (crash_kexec_post_notifiers) - return 0; - /* - * There are 4 panic() calls in do_exit() path, each of which - * corresponds to each of these 4 conditions. - */ - if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) - return 1; - return 0; -} - -/* - * When kexec transitions to the new kernel there is a one-to-one - * mapping between physical and virtual addresses. On processors - * where you can disable the MMU this is trivial, and easy. For - * others it is still a simple predictable page table to setup. - * - * In that environment kexec copies the new kernel to its final - * resting place. This means I can only support memory whose - * physical address can fit in an unsigned long. In particular - * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. - * If the assembly stub has more restrictive requirements - * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be - * defined more restrictively in <asm/kexec.h>. - * - * The code for the transition from the current kernel to the - * the new kernel is placed in the control_code_buffer, whose size - * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single - * page of memory is necessary, but some architectures require more. - * Because this memory must be identity mapped in the transition from - * virtual to physical addresses it must live in the range - * 0 - TASK_SIZE, as only the user space mappings are arbitrarily - * modifiable. - * - * The assembly stub in the control code buffer is passed a linked list - * of descriptor pages detailing the source pages of the new kernel, - * and the destination addresses of those source pages. As this data - * structure is not used in the context of the current OS, it must - * be self-contained. - * - * The code has been made to work with highmem pages and will use a - * destination page in its final resting place (if it happens - * to allocate it). The end product of this is that most of the - * physical address space, and most of RAM can be used. - * - * Future directions include: - * - allocating a page table with the control code buffer identity - * mapped, to simplify machine_kexec and make kexec_on_panic more - * reliable. - */ - -/* - * KIMAGE_NO_DEST is an impossible destination address..., for - * allocating pages whose destination address we do not care about. - */ -#define KIMAGE_NO_DEST (-1UL) +#include <linux/slab.h> -static int kimage_is_destination_range(struct kimage *image, - unsigned long start, unsigned long end); -static struct page *kimage_alloc_page(struct kimage *image, - gfp_t gfp_mask, - unsigned long dest); +#include "kexec_internal.h" static int copy_user_segment_list(struct kimage *image, unsigned long nr_segments, @@ -169,125 +35,6 @@ static int copy_user_segment_list(struct kimage *image, return ret; } -static int sanity_check_segment_list(struct kimage *image) -{ - int result, i; - unsigned long nr_segments = image->nr_segments; - - /* - * Verify we have good destination addresses. The caller is - * responsible for making certain we don't attempt to load - * the new image into invalid or reserved areas of RAM. This - * just verifies it is an address we can use. - * - * Since the kernel does everything in page size chunks ensure - * the destination addresses are page aligned. Too many - * special cases crop of when we don't do this. The most - * insidious is getting overlapping destination addresses - * simply because addresses are changed to page size - * granularity. - */ - result = -EADDRNOTAVAIL; - for (i = 0; i < nr_segments; i++) { - unsigned long mstart, mend; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz; - if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) - return result; - if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) - return result; - } - - /* Verify our destination addresses do not overlap. - * If we alloed overlapping destination addresses - * through very weird things can happen with no - * easy explanation as one segment stops on another. - */ - result = -EINVAL; - for (i = 0; i < nr_segments; i++) { - unsigned long mstart, mend; - unsigned long j; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz; - for (j = 0; j < i; j++) { - unsigned long pstart, pend; - pstart = image->segment[j].mem; - pend = pstart + image->segment[j].memsz; - /* Do the segments overlap ? */ - if ((mend > pstart) && (mstart < pend)) - return result; - } - } - - /* Ensure our buffer sizes are strictly less than - * our memory sizes. This should always be the case, - * and it is easier to check up front than to be surprised - * later on. - */ - result = -EINVAL; - for (i = 0; i < nr_segments; i++) { - if (image->segment[i].bufsz > image->segment[i].memsz) - return result; - } - - /* - * Verify we have good destination addresses. Normally - * the caller is responsible for making certain we don't - * attempt to load the new image into invalid or reserved - * areas of RAM. But crash kernels are preloaded into a - * reserved area of ram. We must ensure the addresses - * are in the reserved area otherwise preloading the - * kernel could corrupt things. - */ - - if (image->type == KEXEC_TYPE_CRASH) { - result = -EADDRNOTAVAIL; - for (i = 0; i < nr_segments; i++) { - unsigned long mstart, mend; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz - 1; - /* Ensure we are within the crash kernel limits */ - if ((mstart < crashk_res.start) || - (mend > crashk_res.end)) - return result; - } - } - - return 0; -} - -static struct kimage *do_kimage_alloc_init(void) -{ - struct kimage *image; - - /* Allocate a controlling structure */ - image = kzalloc(sizeof(*image), GFP_KERNEL); - if (!image) - return NULL; - - image->head = 0; - image->entry = &image->head; - image->last_entry = &image->head; - image->control_page = ~0; /* By default this does not apply */ - image->type = KEXEC_TYPE_DEFAULT; - - /* Initialize the list of control pages */ - INIT_LIST_HEAD(&image->control_pages); - - /* Initialize the list of destination pages */ - INIT_LIST_HEAD(&image->dest_pages); - - /* Initialize the list of unusable pages */ - INIT_LIST_HEAD(&image->unusable_pages); - - return image; -} - -static void kimage_free_page_list(struct list_head *list); - static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, unsigned long nr_segments, struct kexec_segment __user *segments, @@ -354,873 +101,6 @@ out_free_image: return ret; } -#ifdef CONFIG_KEXEC_FILE -static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) -{ - struct fd f = fdget(fd); - int ret; - struct kstat stat; - loff_t pos; - ssize_t bytes = 0; - - if (!f.file) - return -EBADF; - - ret = vfs_getattr(&f.file->f_path, &stat); - if (ret) - goto out; - - if (stat.size > INT_MAX) { - ret = -EFBIG; - goto out; - } - - /* Don't hand 0 to vmalloc, it whines. */ - if (stat.size == 0) { - ret = -EINVAL; - goto out; - } - - *buf = vmalloc(stat.size); - if (!*buf) { - ret = -ENOMEM; - goto out; - } - - pos = 0; - while (pos < stat.size) { - bytes = kernel_read(f.file, pos, (char *)(*buf) + pos, - stat.size - pos); - if (bytes < 0) { - vfree(*buf); - ret = bytes; - goto out; - } - - if (bytes == 0) - break; - pos += bytes; - } - - if (pos != stat.size) { - ret = -EBADF; - vfree(*buf); - goto out; - } - - *buf_len = pos; -out: - fdput(f); - return ret; -} - -/* Architectures can provide this probe function */ -int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, - unsigned long buf_len) -{ - return -ENOEXEC; -} - -void * __weak arch_kexec_kernel_image_load(struct kimage *image) -{ - return ERR_PTR(-ENOEXEC); -} - -void __weak arch_kimage_file_post_load_cleanup(struct kimage *image) -{ -} - -int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, - unsigned long buf_len) -{ - return -EKEYREJECTED; -} - -/* Apply relocations of type RELA */ -int __weak -arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, - unsigned int relsec) -{ - pr_err("RELA relocation unsupported.\n"); - return -ENOEXEC; -} - -/* Apply relocations of type REL */ -int __weak -arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, - unsigned int relsec) -{ - pr_err("REL relocation unsupported.\n"); - return -ENOEXEC; -} - -/* - * Free up memory used by kernel, initrd, and command line. This is temporary - * memory allocation which is not needed any more after these buffers have - * been loaded into separate segments and have been copied elsewhere. - */ -static void kimage_file_post_load_cleanup(struct kimage *image) -{ - struct purgatory_info *pi = &image->purgatory_info; - - vfree(image->kernel_buf); - image->kernel_buf = NULL; - - vfree(image->initrd_buf); - image->initrd_buf = NULL; - - kfree(image->cmdline_buf); - image->cmdline_buf = NULL; - - vfree(pi->purgatory_buf); - pi->purgatory_buf = NULL; - - vfree(pi->sechdrs); - pi->sechdrs = NULL; - - /* See if architecture has anything to cleanup post load */ - arch_kimage_file_post_load_cleanup(image); - - /* - * Above call should have called into bootloader to free up - * any data stored in kimage->image_loader_data. It should - * be ok now to free it up. - */ - kfree(image->image_loader_data); - image->image_loader_data = NULL; -} - -/* - * In file mode list of segments is prepared by kernel. Copy relevant - * data from user space, do error checking, prepare segment list - */ -static int -kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, - const char __user *cmdline_ptr, - unsigned long cmdline_len, unsigned flags) -{ - int ret = 0; - void *ldata; - - ret = copy_file_from_fd(kernel_fd, &image->kernel_buf, - &image->kernel_buf_len); - if (ret) - return ret; - - /* Call arch image probe handlers */ - ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, - image->kernel_buf_len); - - if (ret) - goto out; - -#ifdef CONFIG_KEXEC_VERIFY_SIG - ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, - image->kernel_buf_len); - if (ret) { - pr_debug("kernel signature verification failed.\n"); - goto out; - } - pr_debug("kernel signature verification successful.\n"); -#endif - /* It is possible that there no initramfs is being loaded */ - if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { - ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, - &image->initrd_buf_len); - if (ret) - goto out; - } - - if (cmdline_len) { - image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL); - if (!image->cmdline_buf) { - ret = -ENOMEM; - goto out; - } - - ret = copy_from_user(image->cmdline_buf, cmdline_ptr, - cmdline_len); - if (ret) { - ret = -EFAULT; - goto out; - } - - image->cmdline_buf_len = cmdline_len; - - /* command line should be a string with last byte null */ - if (image->cmdline_buf[cmdline_len - 1] != '\0') { - ret = -EINVAL; - goto out; - } - } - - /* Call arch image load handlers */ - ldata = arch_kexec_kernel_image_load(image); - - if (IS_ERR(ldata)) { - ret = PTR_ERR(ldata); - goto out; - } - - image->image_loader_data = ldata; -out: - /* In case of error, free up all allocated memory in this function */ - if (ret) - kimage_file_post_load_cleanup(image); - return ret; -} - -static int -kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, - int initrd_fd, const char __user *cmdline_ptr, - unsigned long cmdline_len, unsigned long flags) -{ - int ret; - struct kimage *image; - bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH; - - image = do_kimage_alloc_init(); - if (!image) - return -ENOMEM; - - image->file_mode = 1; - - if (kexec_on_panic) { - /* Enable special crash kernel control page alloc policy. */ - image->control_page = crashk_res.start; - image->type = KEXEC_TYPE_CRASH; - } - - ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, - cmdline_ptr, cmdline_len, flags); - if (ret) - goto out_free_image; - - ret = sanity_check_segment_list(image); - if (ret) - goto out_free_post_load_bufs; - - ret = -ENOMEM; - image->control_code_page = kimage_alloc_control_pages(image, - get_order(KEXEC_CONTROL_PAGE_SIZE)); - if (!image->control_code_page) { - pr_err("Could not allocate control_code_buffer\n"); - goto out_free_post_load_bufs; - } - - if (!kexec_on_panic) { - image->swap_page = kimage_alloc_control_pages(image, 0); - if (!image->swap_page) { - pr_err("Could not allocate swap buffer\n"); - goto out_free_control_pages; - } - } - - *rimage = image; - return 0; -out_free_control_pages: - kimage_free_page_list(&image->control_pages); -out_free_post_load_bufs: - kimage_file_post_load_cleanup(image); -out_free_image: - kfree(image); - return ret; -} -#else /* CONFIG_KEXEC_FILE */ -static inline void kimage_file_post_load_cleanup(struct kimage *image) { } -#endif /* CONFIG_KEXEC_FILE */ - -static int kimage_is_destination_range(struct kimage *image, - unsigned long start, - unsigned long end) -{ - unsigned long i; - - for (i = 0; i < image->nr_segments; i++) { - unsigned long mstart, mend; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz; - if ((end > mstart) && (start < mend)) - return 1; - } - - return 0; -} - -static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) -{ - struct page *pages; - - pages = alloc_pages(gfp_mask, order); - if (pages) { - unsigned int count, i; - pages->mapping = NULL; - set_page_private(pages, order); - count = 1 << order; - for (i = 0; i < count; i++) - SetPageReserved(pages + i); - } - - return pages; -} - -static void kimage_free_pages(struct page *page) -{ - unsigned int order, count, i; - - order = page_private(page); - count = 1 << order; - for (i = 0; i < count; i++) - ClearPageReserved(page + i); - __free_pages(page, order); -} - -static void kimage_free_page_list(struct list_head *list) -{ - struct list_head *pos, *next; - - list_for_each_safe(pos, next, list) { - struct page *page; - - page = list_entry(pos, struct page, lru); - list_del(&page->lru); - kimage_free_pages(page); - } -} - -static struct page *kimage_alloc_normal_control_pages(struct kimage *image, - unsigned int order) -{ - /* Control pages are special, they are the intermediaries - * that are needed while we copy the rest of the pages - * to their final resting place. As such they must - * not conflict with either the destination addresses - * or memory the kernel is already using. - * - * The only case where we really need more than one of - * these are for architectures where we cannot disable - * the MMU and must instead generate an identity mapped - * page table for all of the memory. - * - * At worst this runs in O(N) of the image size. - */ - struct list_head extra_pages; - struct page *pages; - unsigned int count; - - count = 1 << order; - INIT_LIST_HEAD(&extra_pages); - - /* Loop while I can allocate a page and the page allocated - * is a destination page. - */ - do { - unsigned long pfn, epfn, addr, eaddr; - - pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order); - if (!pages) - break; - pfn = page_to_pfn(pages); - epfn = pfn + count; - addr = pfn << PAGE_SHIFT; - eaddr = epfn << PAGE_SHIFT; - if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || - kimage_is_destination_range(image, addr, eaddr)) { - list_add(&pages->lru, &extra_pages); - pages = NULL; - } - } while (!pages); - - if (pages) { - /* Remember the allocated page... */ - list_add(&pages->lru, &image->control_pages); - - /* Because the page is already in it's destination - * location we will never allocate another page at - * that address. Therefore kimage_alloc_pages - * will not return it (again) and we don't need - * to give it an entry in image->segment[]. - */ - } - /* Deal with the destination pages I have inadvertently allocated. - * - * Ideally I would convert multi-page allocations into single - * page allocations, and add everything to image->dest_pages. - * - * For now it is simpler to just free the pages. - */ - kimage_free_page_list(&extra_pages); - - return pages; -} - -static struct page *kimage_alloc_crash_control_pages(struct kimage *image, - unsigned int order) -{ - /* Control pages are special, they are the intermediaries - * that are needed while we copy the rest of the pages - * to their final resting place. As such they must - * not conflict with either the destination addresses - * or memory the kernel is already using. - * - * Control pages are also the only pags we must allocate - * when loading a crash kernel. All of the other pages - * are specified by the segments and we just memcpy - * into them directly. - * - * The only case where we really need more than one of - * these are for architectures where we cannot disable - * the MMU and must instead generate an identity mapped - * page table for all of the memory. - * - * Given the low demand this implements a very simple - * allocator that finds the first hole of the appropriate - * size in the reserved memory region, and allocates all - * of the memory up to and including the hole. - */ - unsigned long hole_start, hole_end, size; - struct page *pages; - - pages = NULL; - size = (1 << order) << PAGE_SHIFT; - hole_start = (image->control_page + (size - 1)) & ~(size - 1); - hole_end = hole_start + size - 1; - while (hole_end <= crashk_res.end) { - unsigned long i; - - if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) - break; - /* See if I overlap any of the segments */ - for (i = 0; i < image->nr_segments; i++) { - unsigned long mstart, mend; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz - 1; - if ((hole_end >= mstart) && (hole_start <= mend)) { - /* Advance the hole to the end of the segment */ - hole_start = (mend + (size - 1)) & ~(size - 1); - hole_end = hole_start + size - 1; - break; - } - } - /* If I don't overlap any segments I have found my hole! */ - if (i == image->nr_segments) { - pages = pfn_to_page(hole_start >> PAGE_SHIFT); - break; - } - } - if (pages) - image->control_page = hole_end; - - return pages; -} - - -struct page *kimage_alloc_control_pages(struct kimage *image, - unsigned int order) -{ - struct page *pages = NULL; - - switch (image->type) { - case KEXEC_TYPE_DEFAULT: - pages = kimage_alloc_normal_control_pages(image, order); - break; - case KEXEC_TYPE_CRASH: - pages = kimage_alloc_crash_control_pages(image, order); - break; - } - - return pages; -} - -static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) -{ - if (*image->entry != 0) - image->entry++; - - if (image->entry == image->last_entry) { - kimage_entry_t *ind_page; - struct page *page; - - page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); - if (!page) - return -ENOMEM; - - ind_page = page_address(page); - *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; - image->entry = ind_page; - image->last_entry = ind_page + - ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); - } - *image->entry = entry; - image->entry++; - *image->entry = 0; - - return 0; -} - -static int kimage_set_destination(struct kimage *image, - unsigned long destination) -{ - int result; - - destination &= PAGE_MASK; - result = kimage_add_entry(image, destination | IND_DESTINATION); - - return result; -} - - -static int kimage_add_page(struct kimage *image, unsigned long page) -{ - int result; - - page &= PAGE_MASK; - result = kimage_add_entry(image, page | IND_SOURCE); - - return result; -} - - -static void kimage_free_extra_pages(struct kimage *image) -{ - /* Walk through and free any extra destination pages I may have */ - kimage_free_page_list(&image->dest_pages); - - /* Walk through and free any unusable pages I have cached */ - kimage_free_page_list(&image->unusable_pages); - -} -static void kimage_terminate(struct kimage *image) -{ - if (*image->entry != 0) - image->entry++; - - *image->entry = IND_DONE; -} - -#define for_each_kimage_entry(image, ptr, entry) \ - for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ - ptr = (entry & IND_INDIRECTION) ? \ - phys_to_virt((entry & PAGE_MASK)) : ptr + 1) - -static void kimage_free_entry(kimage_entry_t entry) -{ - struct page *page; - - page = pfn_to_page(entry >> PAGE_SHIFT); - kimage_free_pages(page); -} - -static void kimage_free(struct kimage *image) -{ - kimage_entry_t *ptr, entry; - kimage_entry_t ind = 0; - - if (!image) - return; - - kimage_free_extra_pages(image); - for_each_kimage_entry(image, ptr, entry) { - if (entry & IND_INDIRECTION) { - /* Free the previous indirection page */ - if (ind & IND_INDIRECTION) - kimage_free_entry(ind); - /* Save this indirection page until we are - * done with it. - */ - ind = entry; - } else if (entry & IND_SOURCE) - kimage_free_entry(entry); - } - /* Free the final indirection page */ - if (ind & IND_INDIRECTION) - kimage_free_entry(ind); - - /* Handle any machine specific cleanup */ - machine_kexec_cleanup(image); - - /* Free the kexec control pages... */ - kimage_free_page_list(&image->control_pages); - - /* - * Free up any temporary buffers allocated. This might hit if - * error occurred much later after buffer allocation. - */ - if (image->file_mode) - kimage_file_post_load_cleanup(image); - - kfree(image); -} - -static kimage_entry_t *kimage_dst_used(struct kimage *image, - unsigned long page) -{ - kimage_entry_t *ptr, entry; - unsigned long destination = 0; - - for_each_kimage_entry(image, ptr, entry) { - if (entry & IND_DESTINATION) - destination = entry & PAGE_MASK; - else if (entry & IND_SOURCE) { - if (page == destination) - return ptr; - destination += PAGE_SIZE; - } - } - - return NULL; -} - -static struct page *kimage_alloc_page(struct kimage *image, - gfp_t gfp_mask, - unsigned long destination) -{ - /* - * Here we implement safeguards to ensure that a source page - * is not copied to its destination page before the data on - * the destination page is no longer useful. - * - * To do this we maintain the invariant that a source page is - * either its own destination page, or it is not a - * destination page at all. - * - * That is slightly stronger than required, but the proof - * that no problems will not occur is trivial, and the - * implementation is simply to verify. - * - * When allocating all pages normally this algorithm will run - * in O(N) time, but in the worst case it will run in O(N^2) - * time. If the runtime is a problem the data structures can - * be fixed. - */ - struct page *page; - unsigned long addr; - - /* - * Walk through the list of destination pages, and see if I - * have a match. - */ - list_for_each_entry(page, &image->dest_pages, lru) { - addr = page_to_pfn(page) << PAGE_SHIFT; - if (addr == destination) { - list_del(&page->lru); - return page; - } - } - page = NULL; - while (1) { - kimage_entry_t *old; - - /* Allocate a page, if we run out of memory give up */ - page = kimage_alloc_pages(gfp_mask, 0); - if (!page) - return NULL; - /* If the page cannot be used file it away */ - if (page_to_pfn(page) > - (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { - list_add(&page->lru, &image->unusable_pages); - continue; - } - addr = page_to_pfn(page) << PAGE_SHIFT; - - /* If it is the destination page we want use it */ - if (addr == destination) - break; - - /* If the page is not a destination page use it */ - if (!kimage_is_destination_range(image, addr, - addr + PAGE_SIZE)) - break; - - /* - * I know that the page is someones destination page. - * See if there is already a source page for this - * destination page. And if so swap the source pages. - */ - old = kimage_dst_used(image, addr); - if (old) { - /* If so move it */ - unsigned long old_addr; - struct page *old_page; - - old_addr = *old & PAGE_MASK; - old_page = pfn_to_page(old_addr >> PAGE_SHIFT); - copy_highpage(page, old_page); - *old = addr | (*old & ~PAGE_MASK); - - /* The old page I have found cannot be a - * destination page, so return it if it's - * gfp_flags honor the ones passed in. - */ - if (!(gfp_mask & __GFP_HIGHMEM) && - PageHighMem(old_page)) { - kimage_free_pages(old_page); - continue; - } - addr = old_addr; - page = old_page; - break; - } else { - /* Place the page on the destination list I - * will use it later. - */ - list_add(&page->lru, &image->dest_pages); - } - } - - return page; -} - -static int kimage_load_normal_segment(struct kimage *image, - struct kexec_segment *segment) -{ - unsigned long maddr; - size_t ubytes, mbytes; - int result; - unsigned char __user *buf = NULL; - unsigned char *kbuf = NULL; - - result = 0; - if (image->file_mode) - kbuf = segment->kbuf; - else - buf = segment->buf; - ubytes = segment->bufsz; - mbytes = segment->memsz; - maddr = segment->mem; - - result = kimage_set_destination(image, maddr); - if (result < 0) - goto out; - - while (mbytes) { - struct page *page; - char *ptr; - size_t uchunk, mchunk; - - page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); - if (!page) { - result = -ENOMEM; - goto out; - } - result = kimage_add_page(image, page_to_pfn(page) - << PAGE_SHIFT); - if (result < 0) - goto out; - - ptr = kmap(page); - /* Start with a clear page */ - clear_page(ptr); - ptr += maddr & ~PAGE_MASK; - mchunk = min_t(size_t, mbytes, - PAGE_SIZE - (maddr & ~PAGE_MASK)); - uchunk = min(ubytes, mchunk); - - /* For file based kexec, source pages are in kernel memory */ - if (image->file_mode) - memcpy(ptr, kbuf, uchunk); - else - result = copy_from_user(ptr, buf, uchunk); - kunmap(page); - if (result) { - result = -EFAULT; - goto out; - } - ubytes -= uchunk; - maddr += mchunk; - if (image->file_mode) - kbuf += mchunk; - else - buf += mchunk; - mbytes -= mchunk; - } -out: - return result; -} - -static int kimage_load_crash_segment(struct kimage *image, - struct kexec_segment *segment) -{ - /* For crash dumps kernels we simply copy the data from - * user space to it's destination. - * We do things a page at a time for the sake of kmap. - */ - unsigned long maddr; - size_t ubytes, mbytes; - int result; - unsigned char __user *buf = NULL; - unsigned char *kbuf = NULL; - - result = 0; - if (image->file_mode) - kbuf = segment->kbuf; - else - buf = segment->buf; - ubytes = segment->bufsz; - mbytes = segment->memsz; - maddr = segment->mem; - while (mbytes) { - struct page *page; - char *ptr; - size_t uchunk, mchunk; - - page = pfn_to_page(maddr >> PAGE_SHIFT); - if (!page) { - result = -ENOMEM; - goto out; - } - ptr = kmap(page); - ptr += maddr & ~PAGE_MASK; - mchunk = min_t(size_t, mbytes, - PAGE_SIZE - (maddr & ~PAGE_MASK)); - uchunk = min(ubytes, mchunk); - if (mchunk > uchunk) { - /* Zero the trailing part of the page */ - memset(ptr + uchunk, 0, mchunk - uchunk); - } - - /* For file based kexec, source pages are in kernel memory */ - if (image->file_mode) - memcpy(ptr, kbuf, uchunk); - else - result = copy_from_user(ptr, buf, uchunk); - kexec_flush_icache_page(page); - kunmap(page); - if (result) { - result = -EFAULT; - goto out; - } - ubytes -= uchunk; - maddr += mchunk; - if (image->file_mode) - kbuf += mchunk; - else - buf += mchunk; - mbytes -= mchunk; - } -out: - return result; -} - -static int kimage_load_segment(struct kimage *image, - struct kexec_segment *segment) -{ - int result = -ENOMEM; - - switch (image->type) { - case KEXEC_TYPE_DEFAULT: - result = kimage_load_normal_segment(image, segment); - break; - case KEXEC_TYPE_CRASH: - result = kimage_load_crash_segment(image, segment); - break; - } - - return result; -} - /* * Exec Kernel system call: for obvious reasons only root may call it. * @@ -1241,11 +121,6 @@ static int kimage_load_segment(struct kimage *image, * kexec does not sync, or unmount filesystems so if you need * that to happen you need to do that yourself. */ -struct kimage *kexec_image; -struct kimage *kexec_crash_image; -int kexec_load_disabled; - -static DEFINE_MUTEX(kexec_mutex); SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, struct kexec_segment __user *, segments, unsigned long, flags) @@ -1340,18 +215,6 @@ out: return result; } -/* - * Add and remove page tables for crashkernel memory - * - * Provide an empty default implementation here -- architecture - * code may override this - */ -void __weak crash_map_reserved_pages(void) -{} - -void __weak crash_unmap_reserved_pages(void) -{} - #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, compat_ulong_t, nr_segments, @@ -1390,1391 +253,3 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, return sys_kexec_load(entry, nr_segments, ksegments, flags); } #endif - -#ifdef CONFIG_KEXEC_FILE -SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, - unsigned long, cmdline_len, const char __user *, cmdline_ptr, - unsigned long, flags) -{ - int ret = 0, i; - struct kimage **dest_image, *image; - - /* We only trust the superuser with rebooting the system. */ - if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) - return -EPERM; - - /* Make sure we have a legal set of flags */ - if (flags != (flags & KEXEC_FILE_FLAGS)) - return -EINVAL; - - image = NULL; - - if (!mutex_trylock(&kexec_mutex)) - return -EBUSY; - - dest_image = &kexec_image; - if (flags & KEXEC_FILE_ON_CRASH) - dest_image = &kexec_crash_image; - - if (flags & KEXEC_FILE_UNLOAD) - goto exchange; - - /* - * In case of crash, new kernel gets loaded in reserved region. It is - * same memory where old crash kernel might be loaded. Free any - * current crash dump kernel before we corrupt it. - */ - if (flags & KEXEC_FILE_ON_CRASH) - kimage_free(xchg(&kexec_crash_image, NULL)); - - ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr, - cmdline_len, flags); - if (ret) - goto out; - - ret = machine_kexec_prepare(image); - if (ret) - goto out; - - ret = kexec_calculate_store_digests(image); - if (ret) - goto out; - - for (i = 0; i < image->nr_segments; i++) { - struct kexec_segment *ksegment; - - ksegment = &image->segment[i]; - pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n", - i, ksegment->buf, ksegment->bufsz, ksegment->mem, - ksegment->memsz); - - ret = kimage_load_segment(image, &image->segment[i]); - if (ret) - goto out; - } - - kimage_terminate(image); - - /* - * Free up any temporary buffers allocated which are not needed - * after image has been loaded - */ - kimage_file_post_load_cleanup(image); -exchange: - image = xchg(dest_image, image); -out: - mutex_unlock(&kexec_mutex); - kimage_free(image); - return ret; -} - -#endif /* CONFIG_KEXEC_FILE */ - -void crash_kexec(struct pt_regs *regs) -{ - /* Take the kexec_mutex here to prevent sys_kexec_load - * running on one cpu from replacing the crash kernel - * we are using after a panic on a different cpu. - * - * If the crash kernel was not located in a fixed area - * of memory the xchg(&kexec_crash_image) would be - * sufficient. But since I reuse the memory... - */ - if (mutex_trylock(&kexec_mutex)) { - if (kexec_crash_image) { - struct pt_regs fixed_regs; - - crash_setup_regs(&fixed_regs, regs); - crash_save_vmcoreinfo(); - machine_crash_shutdown(&fixed_regs); - machine_kexec(kexec_crash_image); - } - mutex_unlock(&kexec_mutex); - } -} - -size_t crash_get_memory_size(void) -{ - size_t size = 0; - mutex_lock(&kexec_mutex); - if (crashk_res.end != crashk_res.start) - size = resource_size(&crashk_res); - mutex_unlock(&kexec_mutex); - return size; -} - -void __weak crash_free_reserved_phys_range(unsigned long begin, - unsigned long end) -{ - unsigned long addr; - - for (addr = begin; addr < end; addr += PAGE_SIZE) - free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); -} - -int crash_shrink_memory(unsigned long new_size) -{ - int ret = 0; - unsigned long start, end; - unsigned long old_size; - struct resource *ram_res; - - mutex_lock(&kexec_mutex); - - if (kexec_crash_image) { - ret = -ENOENT; - goto unlock; - } - start = crashk_res.start; - end = crashk_res.end; - old_size = (end == 0) ? 0 : end - start + 1; - if (new_size >= old_size) { - ret = (new_size == old_size) ? 0 : -EINVAL; - goto unlock; - } - - ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); - if (!ram_res) { - ret = -ENOMEM; - goto unlock; - } - - start = roundup(start, KEXEC_CRASH_MEM_ALIGN); - end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); - - crash_map_reserved_pages(); - crash_free_reserved_phys_range(end, crashk_res.end); - - if ((start == end) && (crashk_res.parent != NULL)) - release_resource(&crashk_res); - - ram_res->start = end; - ram_res->end = crashk_res.end; - ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; - ram_res->name = "System RAM"; - - crashk_res.end = end - 1; - - insert_resource(&iomem_resource, ram_res); - crash_unmap_reserved_pages(); - -unlock: - mutex_unlock(&kexec_mutex); - return ret; -} - -static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, - size_t data_len) -{ - struct elf_note note; - - note.n_namesz = strlen(name) + 1; - note.n_descsz = data_len; - note.n_type = type; - memcpy(buf, ¬e, sizeof(note)); - buf += (sizeof(note) + 3)/4; - memcpy(buf, name, note.n_namesz); - buf += (note.n_namesz + 3)/4; - memcpy(buf, data, note.n_descsz); - buf += (note.n_descsz + 3)/4; - - return buf; -} - -static void final_note(u32 *buf) -{ - struct elf_note note; - - note.n_namesz = 0; - note.n_descsz = 0; - note.n_type = 0; - memcpy(buf, ¬e, sizeof(note)); -} - -void crash_save_cpu(struct pt_regs *regs, int cpu) -{ - struct elf_prstatus prstatus; - u32 *buf; - - if ((cpu < 0) || (cpu >= nr_cpu_ids)) - return; - - /* Using ELF notes here is opportunistic. - * I need a well defined structure format - * for the data I pass, and I need tags - * on the data to indicate what information I have - * squirrelled away. ELF notes happen to provide - * all of that, so there is no need to invent something new. - */ - buf = (u32 *)per_cpu_ptr(crash_notes, cpu); - if (!buf) - return; - memset(&prstatus, 0, sizeof(prstatus)); - prstatus.pr_pid = current->pid; - elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); - buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, - &prstatus, sizeof(prstatus)); - final_note(buf); -} - -static int __init crash_notes_memory_init(void) -{ - /* Allocate memory for saving cpu registers. */ - crash_notes = alloc_percpu(note_buf_t); - if (!crash_notes) { - pr_warn("Kexec: Memory allocation for saving cpu register states failed\n"); - return -ENOMEM; - } - return 0; -} -subsys_initcall(crash_notes_memory_init); - - -/* - * parsing the "crashkernel" commandline - * - * this code is intended to be called from architecture specific code - */ - - -/* - * This function parses command lines in the format - * - * crashkernel=ramsize-range:size[,...][@offset] - * - * The function returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_mem(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline, *tmp; - - /* for each entry of the comma-separated list */ - do { - unsigned long long start, end = ULLONG_MAX, size; - - /* get the start of the range */ - start = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (*cur != '-') { - pr_warn("crashkernel: '-' expected\n"); - return -EINVAL; - } - cur++; - - /* if no ':' is here, than we read the end */ - if (*cur != ':') { - end = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (end <= start) { - pr_warn("crashkernel: end <= start\n"); - return -EINVAL; - } - } - - if (*cur != ':') { - pr_warn("crashkernel: ':' expected\n"); - return -EINVAL; - } - cur++; - - size = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (size >= system_ram) { - pr_warn("crashkernel: invalid size\n"); - return -EINVAL; - } - - /* match ? */ - if (system_ram >= start && system_ram < end) { - *crash_size = size; - break; - } - } while (*cur++ == ','); - - if (*crash_size > 0) { - while (*cur && *cur != ' ' && *cur != '@') - cur++; - if (*cur == '@') { - cur++; - *crash_base = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected after '@'\n"); - return -EINVAL; - } - } - } - - return 0; -} - -/* - * That function parses "simple" (old) crashkernel command lines like - * - * crashkernel=size[@offset] - * - * It returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_simple(char *cmdline, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - if (*cur == '@') - *crash_base = memparse(cur+1, &cur); - else if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char\n"); - return -EINVAL; - } - - return 0; -} - -#define SUFFIX_HIGH 0 -#define SUFFIX_LOW 1 -#define SUFFIX_NULL 2 -static __initdata char *suffix_tbl[] = { - [SUFFIX_HIGH] = ",high", - [SUFFIX_LOW] = ",low", - [SUFFIX_NULL] = NULL, -}; - -/* - * That function parses "suffix" crashkernel command lines like - * - * crashkernel=size,[high|low] - * - * It returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_suffix(char *cmdline, - unsigned long long *crash_size, - const char *suffix) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - /* check with suffix */ - if (strncmp(cur, suffix, strlen(suffix))) { - pr_warn("crashkernel: unrecognized char\n"); - return -EINVAL; - } - cur += strlen(suffix); - if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char\n"); - return -EINVAL; - } - - return 0; -} - -static __init char *get_last_crashkernel(char *cmdline, - const char *name, - const char *suffix) -{ - char *p = cmdline, *ck_cmdline = NULL; - - /* find crashkernel and use the last one if there are more */ - p = strstr(p, name); - while (p) { - char *end_p = strchr(p, ' '); - char *q; - - if (!end_p) - end_p = p + strlen(p); - - if (!suffix) { - int i; - - /* skip the one with any known suffix */ - for (i = 0; suffix_tbl[i]; i++) { - q = end_p - strlen(suffix_tbl[i]); - if (!strncmp(q, suffix_tbl[i], - strlen(suffix_tbl[i]))) - goto next; - } - ck_cmdline = p; - } else { - q = end_p - strlen(suffix); - if (!strncmp(q, suffix, strlen(suffix))) - ck_cmdline = p; - } -next: - p = strstr(p+1, name); - } - - if (!ck_cmdline) - return NULL; - - return ck_cmdline; -} - -static int __init __parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base, - const char *name, - const char *suffix) -{ - char *first_colon, *first_space; - char *ck_cmdline; - - BUG_ON(!crash_size || !crash_base); - *crash_size = 0; - *crash_base = 0; - - ck_cmdline = get_last_crashkernel(cmdline, name, suffix); - - if (!ck_cmdline) - return -EINVAL; - - ck_cmdline += strlen(name); - - if (suffix) - return parse_crashkernel_suffix(ck_cmdline, crash_size, - suffix); - /* - * if the commandline contains a ':', then that's the extended - * syntax -- if not, it must be the classic syntax - */ - first_colon = strchr(ck_cmdline, ':'); - first_space = strchr(ck_cmdline, ' '); - if (first_colon && (!first_space || first_colon < first_space)) - return parse_crashkernel_mem(ck_cmdline, system_ram, - crash_size, crash_base); - - return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); -} - -/* - * That function is the entry point for command line parsing and should be - * called from the arch-specific code. - */ -int __init parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", NULL); -} - -int __init parse_crashkernel_high(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", suffix_tbl[SUFFIX_HIGH]); -} - -int __init parse_crashkernel_low(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", suffix_tbl[SUFFIX_LOW]); -} - -static void update_vmcoreinfo_note(void) -{ - u32 *buf = vmcoreinfo_note; - - if (!vmcoreinfo_size) - return; - buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, - vmcoreinfo_size); - final_note(buf); -} - -void crash_save_vmcoreinfo(void) -{ - vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); - update_vmcoreinfo_note(); -} - -void vmcoreinfo_append_str(const char *fmt, ...) -{ - va_list args; - char buf[0x50]; - size_t r; - - va_start(args, fmt); - r = vscnprintf(buf, sizeof(buf), fmt, args); - va_end(args); - - r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); - - memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); - - vmcoreinfo_size += r; -} - -/* - * provide an empty default implementation here -- architecture - * code may override this - */ -void __weak arch_crash_save_vmcoreinfo(void) -{} - -unsigned long __weak paddr_vmcoreinfo_note(void) -{ - return __pa((unsigned long)(char *)&vmcoreinfo_note); -} - -static int __init crash_save_vmcoreinfo_init(void) -{ - VMCOREINFO_OSRELEASE(init_uts_ns.name.release); - VMCOREINFO_PAGESIZE(PAGE_SIZE); - - VMCOREINFO_SYMBOL(init_uts_ns); - VMCOREINFO_SYMBOL(node_online_map); -#ifdef CONFIG_MMU - VMCOREINFO_SYMBOL(swapper_pg_dir); -#endif - VMCOREINFO_SYMBOL(_stext); - VMCOREINFO_SYMBOL(vmap_area_list); - -#ifndef CONFIG_NEED_MULTIPLE_NODES - VMCOREINFO_SYMBOL(mem_map); - VMCOREINFO_SYMBOL(contig_page_data); -#endif -#ifdef CONFIG_SPARSEMEM - VMCOREINFO_SYMBOL(mem_section); - VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); - VMCOREINFO_STRUCT_SIZE(mem_section); - VMCOREINFO_OFFSET(mem_section, section_mem_map); -#endif - VMCOREINFO_STRUCT_SIZE(page); - VMCOREINFO_STRUCT_SIZE(pglist_data); - VMCOREINFO_STRUCT_SIZE(zone); - VMCOREINFO_STRUCT_SIZE(free_area); - VMCOREINFO_STRUCT_SIZE(list_head); - VMCOREINFO_SIZE(nodemask_t); - VMCOREINFO_OFFSET(page, flags); - VMCOREINFO_OFFSET(page, _count); - VMCOREINFO_OFFSET(page, mapping); - VMCOREINFO_OFFSET(page, lru); - VMCOREINFO_OFFSET(page, _mapcount); - VMCOREINFO_OFFSET(page, private); - VMCOREINFO_OFFSET(pglist_data, node_zones); - VMCOREINFO_OFFSET(pglist_data, nr_zones); -#ifdef CONFIG_FLAT_NODE_MEM_MAP - VMCOREINFO_OFFSET(pglist_data, node_mem_map); -#endif - VMCOREINFO_OFFSET(pglist_data, node_start_pfn); - VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); - VMCOREINFO_OFFSET(pglist_data, node_id); - VMCOREINFO_OFFSET(zone, free_area); - VMCOREINFO_OFFSET(zone, vm_stat); - VMCOREINFO_OFFSET(zone, spanned_pages); - VMCOREINFO_OFFSET(free_area, free_list); - VMCOREINFO_OFFSET(list_head, next); - VMCOREINFO_OFFSET(list_head, prev); - VMCOREINFO_OFFSET(vmap_area, va_start); - VMCOREINFO_OFFSET(vmap_area, list); - VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); - log_buf_kexec_setup(); - VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); - VMCOREINFO_NUMBER(NR_FREE_PAGES); - VMCOREINFO_NUMBER(PG_lru); - VMCOREINFO_NUMBER(PG_private); - VMCOREINFO_NUMBER(PG_swapcache); - VMCOREINFO_NUMBER(PG_slab); -#ifdef CONFIG_MEMORY_FAILURE - VMCOREINFO_NUMBER(PG_hwpoison); -#endif - VMCOREINFO_NUMBER(PG_head_mask); - VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); -#ifdef CONFIG_HUGETLBFS - VMCOREINFO_SYMBOL(free_huge_page); -#endif - - arch_crash_save_vmcoreinfo(); - update_vmcoreinfo_note(); - - return 0; -} - -subsys_initcall(crash_save_vmcoreinfo_init); - -#ifdef CONFIG_KEXEC_FILE -static int locate_mem_hole_top_down(unsigned long start, unsigned long end, - struct kexec_buf *kbuf) -{ - struct kimage *image = kbuf->image; - unsigned long temp_start, temp_end; - - temp_end = min(end, kbuf->buf_max); - temp_start = temp_end - kbuf->memsz; - - do { - /* align down start */ - temp_start = temp_start & (~(kbuf->buf_align - 1)); - - if (temp_start < start || temp_start < kbuf->buf_min) - return 0; - - temp_end = temp_start + kbuf->memsz - 1; - - /* - * Make sure this does not conflict with any of existing - * segments - */ - if (kimage_is_destination_range(image, temp_start, temp_end)) { - temp_start = temp_start - PAGE_SIZE; - continue; - } - - /* We found a suitable memory range */ - break; - } while (1); - - /* If we are here, we found a suitable memory range */ - kbuf->mem = temp_start; - - /* Success, stop navigating through remaining System RAM ranges */ - return 1; -} - -static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, - struct kexec_buf *kbuf) -{ - struct kimage *image = kbuf->image; - unsigned long temp_start, temp_end; - - temp_start = max(start, kbuf->buf_min); - - do { - temp_start = ALIGN(temp_start, kbuf->buf_align); - temp_end = temp_start + kbuf->memsz - 1; - - if (temp_end > end || temp_end > kbuf->buf_max) - return 0; - /* - * Make sure this does not conflict with any of existing - * segments - */ - if (kimage_is_destination_range(image, temp_start, temp_end)) { - temp_start = temp_start + PAGE_SIZE; - continue; - } - - /* We found a suitable memory range */ - break; - } while (1); - - /* If we are here, we found a suitable memory range */ - kbuf->mem = temp_start; - - /* Success, stop navigating through remaining System RAM ranges */ - return 1; -} - -static int locate_mem_hole_callback(u64 start, u64 end, void *arg) -{ - struct kexec_buf *kbuf = (struct kexec_buf *)arg; - unsigned long sz = end - start + 1; - - /* Returning 0 will take to next memory range */ - if (sz < kbuf->memsz) - return 0; - - if (end < kbuf->buf_min || start > kbuf->buf_max) - return 0; - - /* - * Allocate memory top down with-in ram range. Otherwise bottom up - * allocation. - */ - if (kbuf->top_down) - return locate_mem_hole_top_down(start, end, kbuf); - return locate_mem_hole_bottom_up(start, end, kbuf); -} - -/* - * Helper function for placing a buffer in a kexec segment. This assumes - * that kexec_mutex is held. - */ -int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, - unsigned long memsz, unsigned long buf_align, - unsigned long buf_min, unsigned long buf_max, - bool top_down, unsigned long *load_addr) -{ - - struct kexec_segment *ksegment; - struct kexec_buf buf, *kbuf; - int ret; - - /* Currently adding segment this way is allowed only in file mode */ - if (!image->file_mode) - return -EINVAL; - - if (image->nr_segments >= KEXEC_SEGMENT_MAX) - return -EINVAL; - - /* - * Make sure we are not trying to add buffer after allocating - * control pages. All segments need to be placed first before - * any control pages are allocated. As control page allocation - * logic goes through list of segments to make sure there are - * no destination overlaps. - */ - if (!list_empty(&image->control_pages)) { - WARN_ON(1); - return -EINVAL; - } - - memset(&buf, 0, sizeof(struct kexec_buf)); - kbuf = &buf; - kbuf->image = image; - kbuf->buffer = buffer; - kbuf->bufsz = bufsz; - - kbuf->memsz = ALIGN(memsz, PAGE_SIZE); - kbuf->buf_align = max(buf_align, PAGE_SIZE); - kbuf->buf_min = buf_min; - kbuf->buf_max = buf_max; - kbuf->top_down = top_down; - - /* Walk the RAM ranges and allocate a suitable range for the buffer */ - if (image->type == KEXEC_TYPE_CRASH) - ret = walk_iomem_res("Crash kernel", - IORESOURCE_MEM | IORESOURCE_BUSY, - crashk_res.start, crashk_res.end, kbuf, - locate_mem_hole_callback); - else - ret = walk_system_ram_res(0, -1, kbuf, - locate_mem_hole_callback); - if (ret != 1) { - /* A suitable memory range could not be found for buffer */ - return -EADDRNOTAVAIL; - } - - /* Found a suitable memory range */ - ksegment = &image->segment[image->nr_segments]; - ksegment->kbuf = kbuf->buffer; - ksegment->bufsz = kbuf->bufsz; - ksegment->mem = kbuf->mem; - ksegment->memsz = kbuf->memsz; - image->nr_segments++; - *load_addr = ksegment->mem; - return 0; -} - -/* Calculate and store the digest of segments */ -static int kexec_calculate_store_digests(struct kimage *image) -{ - struct crypto_shash *tfm; - struct shash_desc *desc; - int ret = 0, i, j, zero_buf_sz, sha_region_sz; - size_t desc_size, nullsz; - char *digest; - void *zero_buf; - struct kexec_sha_region *sha_regions; - struct purgatory_info *pi = &image->purgatory_info; - - zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); - zero_buf_sz = PAGE_SIZE; - - tfm = crypto_alloc_shash("sha256", 0, 0); - if (IS_ERR(tfm)) { - ret = PTR_ERR(tfm); - goto out; - } - - desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); - desc = kzalloc(desc_size, GFP_KERNEL); - if (!desc) { - ret = -ENOMEM; - goto out_free_tfm; - } - - sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region); - sha_regions = vzalloc(sha_region_sz); - if (!sha_regions) - goto out_free_desc; - - desc->tfm = tfm; - desc->flags = 0; - - ret = crypto_shash_init(desc); - if (ret < 0) - goto out_free_sha_regions; - - digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); - if (!digest) { - ret = -ENOMEM; - goto out_free_sha_regions; - } - - for (j = i = 0; i < image->nr_segments; i++) { - struct kexec_segment *ksegment; - - ksegment = &image->segment[i]; - /* - * Skip purgatory as it will be modified once we put digest - * info in purgatory. - */ - if (ksegment->kbuf == pi->purgatory_buf) - continue; - - ret = crypto_shash_update(desc, ksegment->kbuf, - ksegment->bufsz); - if (ret) - break; - - /* - * Assume rest of the buffer is filled with zero and - * update digest accordingly. - */ - nullsz = ksegment->memsz - ksegment->bufsz; - while (nullsz) { - unsigned long bytes = nullsz; - - if (bytes > zero_buf_sz) - bytes = zero_buf_sz; - ret = crypto_shash_update(desc, zero_buf, bytes); - if (ret) - break; - nullsz -= bytes; - } - - if (ret) - break; - - sha_regions[j].start = ksegment->mem; - sha_regions[j].len = ksegment->memsz; - j++; - } - - if (!ret) { - ret = crypto_shash_final(desc, digest); - if (ret) - goto out_free_digest; - ret = kexec_purgatory_get_set_symbol(image, "sha_regions", - sha_regions, sha_region_sz, 0); - if (ret) - goto out_free_digest; - - ret = kexec_purgatory_get_set_symbol(image, "sha256_digest", - digest, SHA256_DIGEST_SIZE, 0); - if (ret) - goto out_free_digest; - } - -out_free_digest: - kfree(digest); -out_free_sha_regions: - vfree(sha_regions); -out_free_desc: - kfree(desc); -out_free_tfm: - kfree(tfm); -out: - return ret; -} - -/* Actually load purgatory. Lot of code taken from kexec-tools */ -static int __kexec_load_purgatory(struct kimage *image, unsigned long min, - unsigned long max, int top_down) -{ - struct purgatory_info *pi = &image->purgatory_info; - unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad; - unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset; - unsigned char *buf_addr, *src; - int i, ret = 0, entry_sidx = -1; - const Elf_Shdr *sechdrs_c; - Elf_Shdr *sechdrs = NULL; - void *purgatory_buf = NULL; - - /* - * sechdrs_c points to section headers in purgatory and are read - * only. No modifications allowed. - */ - sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff; - - /* - * We can not modify sechdrs_c[] and its fields. It is read only. - * Copy it over to a local copy where one can store some temporary - * data and free it at the end. We need to modify ->sh_addr and - * ->sh_offset fields to keep track of permanent and temporary - * locations of sections. - */ - sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); - if (!sechdrs) - return -ENOMEM; - - memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr)); - - /* - * We seem to have multiple copies of sections. First copy is which - * is embedded in kernel in read only section. Some of these sections - * will be copied to a temporary buffer and relocated. And these - * sections will finally be copied to their final destination at - * segment load time. - * - * Use ->sh_offset to reflect section address in memory. It will - * point to original read only copy if section is not allocatable. - * Otherwise it will point to temporary copy which will be relocated. - * - * Use ->sh_addr to contain final address of the section where it - * will go during execution time. - */ - for (i = 0; i < pi->ehdr->e_shnum; i++) { - if (sechdrs[i].sh_type == SHT_NOBITS) - continue; - - sechdrs[i].sh_offset = (unsigned long)pi->ehdr + - sechdrs[i].sh_offset; - } - - /* - * Identify entry point section and make entry relative to section - * start. - */ - entry = pi->ehdr->e_entry; - for (i = 0; i < pi->ehdr->e_shnum; i++) { - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - - if (!(sechdrs[i].sh_flags & SHF_EXECINSTR)) - continue; - - /* Make entry section relative */ - if (sechdrs[i].sh_addr <= pi->ehdr->e_entry && - ((sechdrs[i].sh_addr + sechdrs[i].sh_size) > - pi->ehdr->e_entry)) { - entry_sidx = i; - entry -= sechdrs[i].sh_addr; - break; - } - } - - /* Determine how much memory is needed to load relocatable object. */ - buf_align = 1; - bss_align = 1; - buf_sz = 0; - bss_sz = 0; - - for (i = 0; i < pi->ehdr->e_shnum; i++) { - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - - align = sechdrs[i].sh_addralign; - if (sechdrs[i].sh_type != SHT_NOBITS) { - if (buf_align < align) - buf_align = align; - buf_sz = ALIGN(buf_sz, align); - buf_sz += sechdrs[i].sh_size; - } else { - /* bss section */ - if (bss_align < align) - bss_align = align; - bss_sz = ALIGN(bss_sz, align); - bss_sz += sechdrs[i].sh_size; - } - } - - /* Determine the bss padding required to align bss properly */ - bss_pad = 0; - if (buf_sz & (bss_align - 1)) - bss_pad = bss_align - (buf_sz & (bss_align - 1)); - - memsz = buf_sz + bss_pad + bss_sz; - - /* Allocate buffer for purgatory */ - purgatory_buf = vzalloc(buf_sz); - if (!purgatory_buf) { - ret = -ENOMEM; - goto out; - } - - if (buf_align < bss_align) - buf_align = bss_align; - - /* Add buffer to segment list */ - ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz, - buf_align, min, max, top_down, - &pi->purgatory_load_addr); - if (ret) - goto out; - - /* Load SHF_ALLOC sections */ - buf_addr = purgatory_buf; - load_addr = curr_load_addr = pi->purgatory_load_addr; - bss_addr = load_addr + buf_sz + bss_pad; - - for (i = 0; i < pi->ehdr->e_shnum; i++) { - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - - align = sechdrs[i].sh_addralign; - if (sechdrs[i].sh_type != SHT_NOBITS) { - curr_load_addr = ALIGN(curr_load_addr, align); - offset = curr_load_addr - load_addr; - /* We already modifed ->sh_offset to keep src addr */ - src = (char *) sechdrs[i].sh_offset; - memcpy(buf_addr + offset, src, sechdrs[i].sh_size); - - /* Store load address and source address of section */ - sechdrs[i].sh_addr = curr_load_addr; - - /* - * This section got copied to temporary buffer. Update - * ->sh_offset accordingly. - */ - sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); - - /* Advance to the next address */ - curr_load_addr += sechdrs[i].sh_size; - } else { - bss_addr = ALIGN(bss_addr, align); - sechdrs[i].sh_addr = bss_addr; - bss_addr += sechdrs[i].sh_size; - } - } - - /* Update entry point based on load address of text section */ - if (entry_sidx >= 0) - entry += sechdrs[entry_sidx].sh_addr; - - /* Make kernel jump to purgatory after shutdown */ - image->start = entry; - - /* Used later to get/set symbol values */ - pi->sechdrs = sechdrs; - - /* - * Used later to identify which section is purgatory and skip it - * from checksumming. - */ - pi->purgatory_buf = purgatory_buf; - return ret; -out: - vfree(sechdrs); - vfree(purgatory_buf); - return ret; -} - -static int kexec_apply_relocations(struct kimage *image) -{ - int i, ret; - struct purgatory_info *pi = &image->purgatory_info; - Elf_Shdr *sechdrs = pi->sechdrs; - - /* Apply relocations */ - for (i = 0; i < pi->ehdr->e_shnum; i++) { - Elf_Shdr *section, *symtab; - - if (sechdrs[i].sh_type != SHT_RELA && - sechdrs[i].sh_type != SHT_REL) - continue; - - /* - * For section of type SHT_RELA/SHT_REL, - * ->sh_link contains section header index of associated - * symbol table. And ->sh_info contains section header - * index of section to which relocations apply. - */ - if (sechdrs[i].sh_info >= pi->ehdr->e_shnum || - sechdrs[i].sh_link >= pi->ehdr->e_shnum) - return -ENOEXEC; - - section = &sechdrs[sechdrs[i].sh_info]; - symtab = &sechdrs[sechdrs[i].sh_link]; - - if (!(section->sh_flags & SHF_ALLOC)) - continue; - - /* - * symtab->sh_link contain section header index of associated - * string table. - */ - if (symtab->sh_link >= pi->ehdr->e_shnum) - /* Invalid section number? */ - continue; - - /* - * Respective architecture needs to provide support for applying - * relocations of type SHT_RELA/SHT_REL. - */ - if (sechdrs[i].sh_type == SHT_RELA) - ret = arch_kexec_apply_relocations_add(pi->ehdr, - sechdrs, i); - else if (sechdrs[i].sh_type == SHT_REL) - ret = arch_kexec_apply_relocations(pi->ehdr, - sechdrs, i); - if (ret) - return ret; - } - - return 0; -} - -/* Load relocatable purgatory object and relocate it appropriately */ -int kexec_load_purgatory(struct kimage *image, unsigned long min, - unsigned long max, int top_down, - unsigned long *load_addr) -{ - struct purgatory_info *pi = &image->purgatory_info; - int ret; - - if (kexec_purgatory_size <= 0) - return -EINVAL; - - if (kexec_purgatory_size < sizeof(Elf_Ehdr)) - return -ENOEXEC; - - pi->ehdr = (Elf_Ehdr *)kexec_purgatory; - - if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0 - || pi->ehdr->e_type != ET_REL - || !elf_check_arch(pi->ehdr) - || pi->ehdr->e_shentsize != sizeof(Elf_Shdr)) - return -ENOEXEC; - - if (pi->ehdr->e_shoff >= kexec_purgatory_size - || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) > - kexec_purgatory_size - pi->ehdr->e_shoff)) - return -ENOEXEC; - - ret = __kexec_load_purgatory(image, min, max, top_down); - if (ret) - return ret; - - ret = kexec_apply_relocations(image); - if (ret) - goto out; - - *load_addr = pi->purgatory_load_addr; - return 0; -out: - vfree(pi->sechdrs); - vfree(pi->purgatory_buf); - return ret; -} - -static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, - const char *name) -{ - Elf_Sym *syms; - Elf_Shdr *sechdrs; - Elf_Ehdr *ehdr; - int i, k; - const char *strtab; - - if (!pi->sechdrs || !pi->ehdr) - return NULL; - - sechdrs = pi->sechdrs; - ehdr = pi->ehdr; - - for (i = 0; i < ehdr->e_shnum; i++) { - if (sechdrs[i].sh_type != SHT_SYMTAB) - continue; - - if (sechdrs[i].sh_link >= ehdr->e_shnum) - /* Invalid strtab section number */ - continue; - strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset; - syms = (Elf_Sym *)sechdrs[i].sh_offset; - - /* Go through symbols for a match */ - for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { - if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL) - continue; - - if (strcmp(strtab + syms[k].st_name, name) != 0) - continue; - - if (syms[k].st_shndx == SHN_UNDEF || - syms[k].st_shndx >= ehdr->e_shnum) { - pr_debug("Symbol: %s has bad section index %d.\n", - name, syms[k].st_shndx); - return NULL; - } - - /* Found the symbol we are looking for */ - return &syms[k]; - } - } - - return NULL; -} - -void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) -{ - struct purgatory_info *pi = &image->purgatory_info; - Elf_Sym *sym; - Elf_Shdr *sechdr; - - sym = kexec_purgatory_find_symbol(pi, name); - if (!sym) - return ERR_PTR(-EINVAL); - - sechdr = &pi->sechdrs[sym->st_shndx]; - - /* - * Returns the address where symbol will finally be loaded after - * kexec_load_segment() - */ - return (void *)(sechdr->sh_addr + sym->st_value); -} - -/* - * Get or set value of a symbol. If "get_value" is true, symbol value is - * returned in buf otherwise symbol value is set based on value in buf. - */ -int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, - void *buf, unsigned int size, bool get_value) -{ - Elf_Sym *sym; - Elf_Shdr *sechdrs; - struct purgatory_info *pi = &image->purgatory_info; - char *sym_buf; - - sym = kexec_purgatory_find_symbol(pi, name); - if (!sym) - return -EINVAL; - - if (sym->st_size != size) { - pr_err("symbol %s size mismatch: expected %lu actual %u\n", - name, (unsigned long)sym->st_size, size); - return -EINVAL; - } - - sechdrs = pi->sechdrs; - - if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { - pr_err("symbol %s is in a bss section. Cannot %s\n", name, - get_value ? "get" : "set"); - return -EINVAL; - } - - sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset + - sym->st_value; - - if (get_value) - memcpy((void *)buf, sym_buf, size); - else - memcpy((void *)sym_buf, buf, size); - - return 0; -} -#endif /* CONFIG_KEXEC_FILE */ - -/* - * Move into place and start executing a preloaded standalone - * executable. If nothing was preloaded return an error. - */ -int kernel_kexec(void) -{ - int error = 0; - - if (!mutex_trylock(&kexec_mutex)) - return -EBUSY; - if (!kexec_image) { - error = -EINVAL; - goto Unlock; - } - -#ifdef CONFIG_KEXEC_JUMP - if (kexec_image->preserve_context) { - lock_system_sleep(); - pm_prepare_console(); - error = freeze_processes(); - if (error) { - error = -EBUSY; - goto Restore_console; - } - suspend_console(); - error = dpm_suspend_start(PMSG_FREEZE); - if (error) - goto Resume_console; - /* At this point, dpm_suspend_start() has been called, - * but *not* dpm_suspend_end(). We *must* call - * dpm_suspend_end() now. Otherwise, drivers for - * some devices (e.g. interrupt controllers) become - * desynchronized with the actual state of the - * hardware at resume time, and evil weirdness ensues. - */ - error = dpm_suspend_end(PMSG_FREEZE); - if (error) - goto Resume_devices; - error = disable_nonboot_cpus(); - if (error) - goto Enable_cpus; - local_irq_disable(); - error = syscore_suspend(); - if (error) - goto Enable_irqs; - } else -#endif - { - kexec_in_progress = true; - kernel_restart_prepare(NULL); - migrate_to_reboot_cpu(); - - /* - * migrate_to_reboot_cpu() disables CPU hotplug assuming that - * no further code needs to use CPU hotplug (which is true in - * the reboot case). However, the kexec path depends on using - * CPU hotplug again; so re-enable it here. - */ - cpu_hotplug_enable(); - pr_emerg("Starting new kernel\n"); - machine_shutdown(); - } - - machine_kexec(kexec_image); - -#ifdef CONFIG_KEXEC_JUMP - if (kexec_image->preserve_context) { - syscore_resume(); - Enable_irqs: - local_irq_enable(); - Enable_cpus: - enable_nonboot_cpus(); - dpm_resume_start(PMSG_RESTORE); - Resume_devices: - dpm_resume_end(PMSG_RESTORE); - Resume_console: - resume_console(); - thaw_processes(); - Restore_console: - pm_restore_console(); - unlock_system_sleep(); - } -#endif - - Unlock: - mutex_unlock(&kexec_mutex); - return error; -} diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c new file mode 100644 index 000000000000..201b45327804 --- /dev/null +++ b/kernel/kexec_core.c @@ -0,0 +1,1534 @@ +/* + * kexec.c - kexec system call core code. + * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#define pr_fmt(fmt) "kexec: " fmt + +#include <linux/capability.h> +#include <linux/mm.h> +#include <linux/file.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/kexec.h> +#include <linux/mutex.h> +#include <linux/list.h> +#include <linux/highmem.h> +#include <linux/syscalls.h> +#include <linux/reboot.h> +#include <linux/ioport.h> +#include <linux/hardirq.h> +#include <linux/elf.h> +#include <linux/elfcore.h> +#include <linux/utsname.h> +#include <linux/numa.h> +#include <linux/suspend.h> +#include <linux/device.h> +#include <linux/freezer.h> +#include <linux/pm.h> +#include <linux/cpu.h> +#include <linux/uaccess.h> +#include <linux/io.h> +#include <linux/console.h> +#include <linux/vmalloc.h> +#include <linux/swap.h> +#include <linux/syscore_ops.h> +#include <linux/compiler.h> +#include <linux/hugetlb.h> + +#include <asm/page.h> +#include <asm/sections.h> + +#include <crypto/hash.h> +#include <crypto/sha.h> +#include "kexec_internal.h" + +DEFINE_MUTEX(kexec_mutex); + +/* Per cpu memory for storing cpu states in case of system crash. */ +note_buf_t __percpu *crash_notes; + +/* vmcoreinfo stuff */ +static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; +u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; +size_t vmcoreinfo_size; +size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); + +/* Flag to indicate we are going to kexec a new kernel */ +bool kexec_in_progress = false; + + +/* Location of the reserved area for the crash kernel */ +struct resource crashk_res = { + .name = "Crash kernel", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; +struct resource crashk_low_res = { + .name = "Crash kernel", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +int kexec_should_crash(struct task_struct *p) +{ + /* + * If crash_kexec_post_notifiers is enabled, don't run + * crash_kexec() here yet, which must be run after panic + * notifiers in panic(). + */ + if (crash_kexec_post_notifiers) + return 0; + /* + * There are 4 panic() calls in do_exit() path, each of which + * corresponds to each of these 4 conditions. + */ + if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) + return 1; + return 0; +} + +/* + * When kexec transitions to the new kernel there is a one-to-one + * mapping between physical and virtual addresses. On processors + * where you can disable the MMU this is trivial, and easy. For + * others it is still a simple predictable page table to setup. + * + * In that environment kexec copies the new kernel to its final + * resting place. This means I can only support memory whose + * physical address can fit in an unsigned long. In particular + * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. + * If the assembly stub has more restrictive requirements + * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be + * defined more restrictively in <asm/kexec.h>. + * + * The code for the transition from the current kernel to the + * the new kernel is placed in the control_code_buffer, whose size + * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single + * page of memory is necessary, but some architectures require more. + * Because this memory must be identity mapped in the transition from + * virtual to physical addresses it must live in the range + * 0 - TASK_SIZE, as only the user space mappings are arbitrarily + * modifiable. + * + * The assembly stub in the control code buffer is passed a linked list + * of descriptor pages detailing the source pages of the new kernel, + * and the destination addresses of those source pages. As this data + * structure is not used in the context of the current OS, it must + * be self-contained. + * + * The code has been made to work with highmem pages and will use a + * destination page in its final resting place (if it happens + * to allocate it). The end product of this is that most of the + * physical address space, and most of RAM can be used. + * + * Future directions include: + * - allocating a page table with the control code buffer identity + * mapped, to simplify machine_kexec and make kexec_on_panic more + * reliable. + */ + +/* + * KIMAGE_NO_DEST is an impossible destination address..., for + * allocating pages whose destination address we do not care about. + */ +#define KIMAGE_NO_DEST (-1UL) + +static struct page *kimage_alloc_page(struct kimage *image, + gfp_t gfp_mask, + unsigned long dest); + +int sanity_check_segment_list(struct kimage *image) +{ + int result, i; + unsigned long nr_segments = image->nr_segments; + + /* + * Verify we have good destination addresses. The caller is + * responsible for making certain we don't attempt to load + * the new image into invalid or reserved areas of RAM. This + * just verifies it is an address we can use. + * + * Since the kernel does everything in page size chunks ensure + * the destination addresses are page aligned. Too many + * special cases crop of when we don't do this. The most + * insidious is getting overlapping destination addresses + * simply because addresses are changed to page size + * granularity. + */ + result = -EADDRNOTAVAIL; + for (i = 0; i < nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) + return result; + if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) + return result; + } + + /* Verify our destination addresses do not overlap. + * If we alloed overlapping destination addresses + * through very weird things can happen with no + * easy explanation as one segment stops on another. + */ + result = -EINVAL; + for (i = 0; i < nr_segments; i++) { + unsigned long mstart, mend; + unsigned long j; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + for (j = 0; j < i; j++) { + unsigned long pstart, pend; + + pstart = image->segment[j].mem; + pend = pstart + image->segment[j].memsz; + /* Do the segments overlap ? */ + if ((mend > pstart) && (mstart < pend)) + return result; + } + } + + /* Ensure our buffer sizes are strictly less than + * our memory sizes. This should always be the case, + * and it is easier to check up front than to be surprised + * later on. + */ + result = -EINVAL; + for (i = 0; i < nr_segments; i++) { + if (image->segment[i].bufsz > image->segment[i].memsz) + return result; + } + + /* + * Verify we have good destination addresses. Normally + * the caller is responsible for making certain we don't + * attempt to load the new image into invalid or reserved + * areas of RAM. But crash kernels are preloaded into a + * reserved area of ram. We must ensure the addresses + * are in the reserved area otherwise preloading the + * kernel could corrupt things. + */ + + if (image->type == KEXEC_TYPE_CRASH) { + result = -EADDRNOTAVAIL; + for (i = 0; i < nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz - 1; + /* Ensure we are within the crash kernel limits */ + if ((mstart < crashk_res.start) || + (mend > crashk_res.end)) + return result; + } + } + + return 0; +} + +struct kimage *do_kimage_alloc_init(void) +{ + struct kimage *image; + + /* Allocate a controlling structure */ + image = kzalloc(sizeof(*image), GFP_KERNEL); + if (!image) + return NULL; + + image->head = 0; + image->entry = &image->head; + image->last_entry = &image->head; + image->control_page = ~0; /* By default this does not apply */ + image->type = KEXEC_TYPE_DEFAULT; + + /* Initialize the list of control pages */ + INIT_LIST_HEAD(&image->control_pages); + + /* Initialize the list of destination pages */ + INIT_LIST_HEAD(&image->dest_pages); + + /* Initialize the list of unusable pages */ + INIT_LIST_HEAD(&image->unusable_pages); + + return image; +} + +int kimage_is_destination_range(struct kimage *image, + unsigned long start, + unsigned long end) +{ + unsigned long i; + + for (i = 0; i < image->nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + if ((end > mstart) && (start < mend)) + return 1; + } + + return 0; +} + +static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) +{ + struct page *pages; + + pages = alloc_pages(gfp_mask, order); + if (pages) { + unsigned int count, i; + + pages->mapping = NULL; + set_page_private(pages, order); + count = 1 << order; + for (i = 0; i < count; i++) + SetPageReserved(pages + i); + } + + return pages; +} + +static void kimage_free_pages(struct page *page) +{ + unsigned int order, count, i; + + order = page_private(page); + count = 1 << order; + for (i = 0; i < count; i++) + ClearPageReserved(page + i); + __free_pages(page, order); +} + +void kimage_free_page_list(struct list_head *list) +{ + struct list_head *pos, *next; + + list_for_each_safe(pos, next, list) { + struct page *page; + + page = list_entry(pos, struct page, lru); + list_del(&page->lru); + kimage_free_pages(page); + } +} + +static struct page *kimage_alloc_normal_control_pages(struct kimage *image, + unsigned int order) +{ + /* Control pages are special, they are the intermediaries + * that are needed while we copy the rest of the pages + * to their final resting place. As such they must + * not conflict with either the destination addresses + * or memory the kernel is already using. + * + * The only case where we really need more than one of + * these are for architectures where we cannot disable + * the MMU and must instead generate an identity mapped + * page table for all of the memory. + * + * At worst this runs in O(N) of the image size. + */ + struct list_head extra_pages; + struct page *pages; + unsigned int count; + + count = 1 << order; + INIT_LIST_HEAD(&extra_pages); + + /* Loop while I can allocate a page and the page allocated + * is a destination page. + */ + do { + unsigned long pfn, epfn, addr, eaddr; + + pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order); + if (!pages) + break; + pfn = page_to_pfn(pages); + epfn = pfn + count; + addr = pfn << PAGE_SHIFT; + eaddr = epfn << PAGE_SHIFT; + if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || + kimage_is_destination_range(image, addr, eaddr)) { + list_add(&pages->lru, &extra_pages); + pages = NULL; + } + } while (!pages); + + if (pages) { + /* Remember the allocated page... */ + list_add(&pages->lru, &image->control_pages); + + /* Because the page is already in it's destination + * location we will never allocate another page at + * that address. Therefore kimage_alloc_pages + * will not return it (again) and we don't need + * to give it an entry in image->segment[]. + */ + } + /* Deal with the destination pages I have inadvertently allocated. + * + * Ideally I would convert multi-page allocations into single + * page allocations, and add everything to image->dest_pages. + * + * For now it is simpler to just free the pages. + */ + kimage_free_page_list(&extra_pages); + + return pages; +} + +static struct page *kimage_alloc_crash_control_pages(struct kimage *image, + unsigned int order) +{ + /* Control pages are special, they are the intermediaries + * that are needed while we copy the rest of the pages + * to their final resting place. As such they must + * not conflict with either the destination addresses + * or memory the kernel is already using. + * + * Control pages are also the only pags we must allocate + * when loading a crash kernel. All of the other pages + * are specified by the segments and we just memcpy + * into them directly. + * + * The only case where we really need more than one of + * these are for architectures where we cannot disable + * the MMU and must instead generate an identity mapped + * page table for all of the memory. + * + * Given the low demand this implements a very simple + * allocator that finds the first hole of the appropriate + * size in the reserved memory region, and allocates all + * of the memory up to and including the hole. + */ + unsigned long hole_start, hole_end, size; + struct page *pages; + + pages = NULL; + size = (1 << order) << PAGE_SHIFT; + hole_start = (image->control_page + (size - 1)) & ~(size - 1); + hole_end = hole_start + size - 1; + while (hole_end <= crashk_res.end) { + unsigned long i; + + if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) + break; + /* See if I overlap any of the segments */ + for (i = 0; i < image->nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz - 1; + if ((hole_end >= mstart) && (hole_start <= mend)) { + /* Advance the hole to the end of the segment */ + hole_start = (mend + (size - 1)) & ~(size - 1); + hole_end = hole_start + size - 1; + break; + } + } + /* If I don't overlap any segments I have found my hole! */ + if (i == image->nr_segments) { + pages = pfn_to_page(hole_start >> PAGE_SHIFT); + image->control_page = hole_end; + break; + } + } + + return pages; +} + + +struct page *kimage_alloc_control_pages(struct kimage *image, + unsigned int order) +{ + struct page *pages = NULL; + + switch (image->type) { + case KEXEC_TYPE_DEFAULT: + pages = kimage_alloc_normal_control_pages(image, order); + break; + case KEXEC_TYPE_CRASH: + pages = kimage_alloc_crash_control_pages(image, order); + break; + } + + return pages; +} + +static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) +{ + if (*image->entry != 0) + image->entry++; + + if (image->entry == image->last_entry) { + kimage_entry_t *ind_page; + struct page *page; + + page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); + if (!page) + return -ENOMEM; + + ind_page = page_address(page); + *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; + image->entry = ind_page; + image->last_entry = ind_page + + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); + } + *image->entry = entry; + image->entry++; + *image->entry = 0; + + return 0; +} + +static int kimage_set_destination(struct kimage *image, + unsigned long destination) +{ + int result; + + destination &= PAGE_MASK; + result = kimage_add_entry(image, destination | IND_DESTINATION); + + return result; +} + + +static int kimage_add_page(struct kimage *image, unsigned long page) +{ + int result; + + page &= PAGE_MASK; + result = kimage_add_entry(image, page | IND_SOURCE); + + return result; +} + + +static void kimage_free_extra_pages(struct kimage *image) +{ + /* Walk through and free any extra destination pages I may have */ + kimage_free_page_list(&image->dest_pages); + + /* Walk through and free any unusable pages I have cached */ + kimage_free_page_list(&image->unusable_pages); + +} +void kimage_terminate(struct kimage *image) +{ + if (*image->entry != 0) + image->entry++; + + *image->entry = IND_DONE; +} + +#define for_each_kimage_entry(image, ptr, entry) \ + for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ + ptr = (entry & IND_INDIRECTION) ? \ + phys_to_virt((entry & PAGE_MASK)) : ptr + 1) + +static void kimage_free_entry(kimage_entry_t entry) +{ + struct page *page; + + page = pfn_to_page(entry >> PAGE_SHIFT); + kimage_free_pages(page); +} + +void kimage_free(struct kimage *image) +{ + kimage_entry_t *ptr, entry; + kimage_entry_t ind = 0; + + if (!image) + return; + + kimage_free_extra_pages(image); + for_each_kimage_entry(image, ptr, entry) { + if (entry & IND_INDIRECTION) { + /* Free the previous indirection page */ + if (ind & IND_INDIRECTION) + kimage_free_entry(ind); + /* Save this indirection page until we are + * done with it. + */ + ind = entry; + } else if (entry & IND_SOURCE) + kimage_free_entry(entry); + } + /* Free the final indirection page */ + if (ind & IND_INDIRECTION) + kimage_free_entry(ind); + + /* Handle any machine specific cleanup */ + machine_kexec_cleanup(image); + + /* Free the kexec control pages... */ + kimage_free_page_list(&image->control_pages); + + /* + * Free up any temporary buffers allocated. This might hit if + * error occurred much later after buffer allocation. + */ + if (image->file_mode) + kimage_file_post_load_cleanup(image); + + kfree(image); +} + +static kimage_entry_t *kimage_dst_used(struct kimage *image, + unsigned long page) +{ + kimage_entry_t *ptr, entry; + unsigned long destination = 0; + + for_each_kimage_entry(image, ptr, entry) { + if (entry & IND_DESTINATION) + destination = entry & PAGE_MASK; + else if (entry & IND_SOURCE) { + if (page == destination) + return ptr; + destination += PAGE_SIZE; + } + } + + return NULL; +} + +static struct page *kimage_alloc_page(struct kimage *image, + gfp_t gfp_mask, + unsigned long destination) +{ + /* + * Here we implement safeguards to ensure that a source page + * is not copied to its destination page before the data on + * the destination page is no longer useful. + * + * To do this we maintain the invariant that a source page is + * either its own destination page, or it is not a + * destination page at all. + * + * That is slightly stronger than required, but the proof + * that no problems will not occur is trivial, and the + * implementation is simply to verify. + * + * When allocating all pages normally this algorithm will run + * in O(N) time, but in the worst case it will run in O(N^2) + * time. If the runtime is a problem the data structures can + * be fixed. + */ + struct page *page; + unsigned long addr; + + /* + * Walk through the list of destination pages, and see if I + * have a match. + */ + list_for_each_entry(page, &image->dest_pages, lru) { + addr = page_to_pfn(page) << PAGE_SHIFT; + if (addr == destination) { + list_del(&page->lru); + return page; + } + } + page = NULL; + while (1) { + kimage_entry_t *old; + + /* Allocate a page, if we run out of memory give up */ + page = kimage_alloc_pages(gfp_mask, 0); + if (!page) + return NULL; + /* If the page cannot be used file it away */ + if (page_to_pfn(page) > + (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { + list_add(&page->lru, &image->unusable_pages); + continue; + } + addr = page_to_pfn(page) << PAGE_SHIFT; + + /* If it is the destination page we want use it */ + if (addr == destination) + break; + + /* If the page is not a destination page use it */ + if (!kimage_is_destination_range(image, addr, + addr + PAGE_SIZE)) + break; + + /* + * I know that the page is someones destination page. + * See if there is already a source page for this + * destination page. And if so swap the source pages. + */ + old = kimage_dst_used(image, addr); + if (old) { + /* If so move it */ + unsigned long old_addr; + struct page *old_page; + + old_addr = *old & PAGE_MASK; + old_page = pfn_to_page(old_addr >> PAGE_SHIFT); + copy_highpage(page, old_page); + *old = addr | (*old & ~PAGE_MASK); + + /* The old page I have found cannot be a + * destination page, so return it if it's + * gfp_flags honor the ones passed in. + */ + if (!(gfp_mask & __GFP_HIGHMEM) && + PageHighMem(old_page)) { + kimage_free_pages(old_page); + continue; + } + addr = old_addr; + page = old_page; + break; + } + /* Place the page on the destination list, to be used later */ + list_add(&page->lru, &image->dest_pages); + } + + return page; +} + +static int kimage_load_normal_segment(struct kimage *image, + struct kexec_segment *segment) +{ + unsigned long maddr; + size_t ubytes, mbytes; + int result; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; + + result = 0; + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; + ubytes = segment->bufsz; + mbytes = segment->memsz; + maddr = segment->mem; + + result = kimage_set_destination(image, maddr); + if (result < 0) + goto out; + + while (mbytes) { + struct page *page; + char *ptr; + size_t uchunk, mchunk; + + page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); + if (!page) { + result = -ENOMEM; + goto out; + } + result = kimage_add_page(image, page_to_pfn(page) + << PAGE_SHIFT); + if (result < 0) + goto out; + + ptr = kmap(page); + /* Start with a clear page */ + clear_page(ptr); + ptr += maddr & ~PAGE_MASK; + mchunk = min_t(size_t, mbytes, + PAGE_SIZE - (maddr & ~PAGE_MASK)); + uchunk = min(ubytes, mchunk); + + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); + kunmap(page); + if (result) { + result = -EFAULT; + goto out; + } + ubytes -= uchunk; + maddr += mchunk; + if (image->file_mode) + kbuf += mchunk; + else + buf += mchunk; + mbytes -= mchunk; + } +out: + return result; +} + +static int kimage_load_crash_segment(struct kimage *image, + struct kexec_segment *segment) +{ + /* For crash dumps kernels we simply copy the data from + * user space to it's destination. + * We do things a page at a time for the sake of kmap. + */ + unsigned long maddr; + size_t ubytes, mbytes; + int result; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; + + result = 0; + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; + ubytes = segment->bufsz; + mbytes = segment->memsz; + maddr = segment->mem; + while (mbytes) { + struct page *page; + char *ptr; + size_t uchunk, mchunk; + + page = pfn_to_page(maddr >> PAGE_SHIFT); + if (!page) { + result = -ENOMEM; + goto out; + } + ptr = kmap(page); + ptr += maddr & ~PAGE_MASK; + mchunk = min_t(size_t, mbytes, + PAGE_SIZE - (maddr & ~PAGE_MASK)); + uchunk = min(ubytes, mchunk); + if (mchunk > uchunk) { + /* Zero the trailing part of the page */ + memset(ptr + uchunk, 0, mchunk - uchunk); + } + + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); + kexec_flush_icache_page(page); + kunmap(page); + if (result) { + result = -EFAULT; + goto out; + } + ubytes -= uchunk; + maddr += mchunk; + if (image->file_mode) + kbuf += mchunk; + else + buf += mchunk; + mbytes -= mchunk; + } +out: + return result; +} + +int kimage_load_segment(struct kimage *image, + struct kexec_segment *segment) +{ + int result = -ENOMEM; + + switch (image->type) { + case KEXEC_TYPE_DEFAULT: + result = kimage_load_normal_segment(image, segment); + break; + case KEXEC_TYPE_CRASH: + result = kimage_load_crash_segment(image, segment); + break; + } + + return result; +} + +struct kimage *kexec_image; +struct kimage *kexec_crash_image; +int kexec_load_disabled; + +void crash_kexec(struct pt_regs *regs) +{ + /* Take the kexec_mutex here to prevent sys_kexec_load + * running on one cpu from replacing the crash kernel + * we are using after a panic on a different cpu. + * + * If the crash kernel was not located in a fixed area + * of memory the xchg(&kexec_crash_image) would be + * sufficient. But since I reuse the memory... + */ + if (mutex_trylock(&kexec_mutex)) { + if (kexec_crash_image) { + struct pt_regs fixed_regs; + + crash_setup_regs(&fixed_regs, regs); + crash_save_vmcoreinfo(); + machine_crash_shutdown(&fixed_regs); + machine_kexec(kexec_crash_image); + } + mutex_unlock(&kexec_mutex); + } +} + +size_t crash_get_memory_size(void) +{ + size_t size = 0; + + mutex_lock(&kexec_mutex); + if (crashk_res.end != crashk_res.start) + size = resource_size(&crashk_res); + mutex_unlock(&kexec_mutex); + return size; +} + +void __weak crash_free_reserved_phys_range(unsigned long begin, + unsigned long end) +{ + unsigned long addr; + + for (addr = begin; addr < end; addr += PAGE_SIZE) + free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); +} + +int crash_shrink_memory(unsigned long new_size) +{ + int ret = 0; + unsigned long start, end; + unsigned long old_size; + struct resource *ram_res; + + mutex_lock(&kexec_mutex); + + if (kexec_crash_image) { + ret = -ENOENT; + goto unlock; + } + start = crashk_res.start; + end = crashk_res.end; + old_size = (end == 0) ? 0 : end - start + 1; + if (new_size >= old_size) { + ret = (new_size == old_size) ? 0 : -EINVAL; + goto unlock; + } + + ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); + if (!ram_res) { + ret = -ENOMEM; + goto unlock; + } + + start = roundup(start, KEXEC_CRASH_MEM_ALIGN); + end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); + + crash_map_reserved_pages(); + crash_free_reserved_phys_range(end, crashk_res.end); + + if ((start == end) && (crashk_res.parent != NULL)) + release_resource(&crashk_res); + + ram_res->start = end; + ram_res->end = crashk_res.end; + ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; + ram_res->name = "System RAM"; + + crashk_res.end = end - 1; + + insert_resource(&iomem_resource, ram_res); + crash_unmap_reserved_pages(); + +unlock: + mutex_unlock(&kexec_mutex); + return ret; +} + +static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, + size_t data_len) +{ + struct elf_note note; + + note.n_namesz = strlen(name) + 1; + note.n_descsz = data_len; + note.n_type = type; + memcpy(buf, ¬e, sizeof(note)); + buf += (sizeof(note) + 3)/4; + memcpy(buf, name, note.n_namesz); + buf += (note.n_namesz + 3)/4; + memcpy(buf, data, note.n_descsz); + buf += (note.n_descsz + 3)/4; + + return buf; +} + +static void final_note(u32 *buf) +{ + struct elf_note note; + + note.n_namesz = 0; + note.n_descsz = 0; + note.n_type = 0; + memcpy(buf, ¬e, sizeof(note)); +} + +void crash_save_cpu(struct pt_regs *regs, int cpu) +{ + struct elf_prstatus prstatus; + u32 *buf; + + if ((cpu < 0) || (cpu >= nr_cpu_ids)) + return; + + /* Using ELF notes here is opportunistic. + * I need a well defined structure format + * for the data I pass, and I need tags + * on the data to indicate what information I have + * squirrelled away. ELF notes happen to provide + * all of that, so there is no need to invent something new. + */ + buf = (u32 *)per_cpu_ptr(crash_notes, cpu); + if (!buf) + return; + memset(&prstatus, 0, sizeof(prstatus)); + prstatus.pr_pid = current->pid; + elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); + buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, + &prstatus, sizeof(prstatus)); + final_note(buf); +} + +static int __init crash_notes_memory_init(void) +{ + /* Allocate memory for saving cpu registers. */ + size_t size, align; + + /* + * crash_notes could be allocated across 2 vmalloc pages when percpu + * is vmalloc based . vmalloc doesn't guarantee 2 continuous vmalloc + * pages are also on 2 continuous physical pages. In this case the + * 2nd part of crash_notes in 2nd page could be lost since only the + * starting address and size of crash_notes are exported through sysfs. + * Here round up the size of crash_notes to the nearest power of two + * and pass it to __alloc_percpu as align value. This can make sure + * crash_notes is allocated inside one physical page. + */ + size = sizeof(note_buf_t); + align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE); + + /* + * Break compile if size is bigger than PAGE_SIZE since crash_notes + * definitely will be in 2 pages with that. + */ + BUILD_BUG_ON(size > PAGE_SIZE); + + crash_notes = __alloc_percpu(size, align); + if (!crash_notes) { + pr_warn("Kexec: Memory allocation for saving cpu register states failed\n"); + return -ENOMEM; + } + return 0; +} +subsys_initcall(crash_notes_memory_init); + + +/* + * parsing the "crashkernel" commandline + * + * this code is intended to be called from architecture specific code + */ + + +/* + * This function parses command lines in the format + * + * crashkernel=ramsize-range:size[,...][@offset] + * + * The function returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_mem(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline, *tmp; + + /* for each entry of the comma-separated list */ + do { + unsigned long long start, end = ULLONG_MAX, size; + + /* get the start of the range */ + start = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (*cur != '-') { + pr_warn("crashkernel: '-' expected\n"); + return -EINVAL; + } + cur++; + + /* if no ':' is here, than we read the end */ + if (*cur != ':') { + end = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (end <= start) { + pr_warn("crashkernel: end <= start\n"); + return -EINVAL; + } + } + + if (*cur != ':') { + pr_warn("crashkernel: ':' expected\n"); + return -EINVAL; + } + cur++; + + size = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (size >= system_ram) { + pr_warn("crashkernel: invalid size\n"); + return -EINVAL; + } + + /* match ? */ + if (system_ram >= start && system_ram < end) { + *crash_size = size; + break; + } + } while (*cur++ == ','); + + if (*crash_size > 0) { + while (*cur && *cur != ' ' && *cur != '@') + cur++; + if (*cur == '@') { + cur++; + *crash_base = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("Memory value expected after '@'\n"); + return -EINVAL; + } + } + } + + return 0; +} + +/* + * That function parses "simple" (old) crashkernel command lines like + * + * crashkernel=size[@offset] + * + * It returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_simple(char *cmdline, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + if (*cur == '@') + *crash_base = memparse(cur+1, &cur); + else if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char\n"); + return -EINVAL; + } + + return 0; +} + +#define SUFFIX_HIGH 0 +#define SUFFIX_LOW 1 +#define SUFFIX_NULL 2 +static __initdata char *suffix_tbl[] = { + [SUFFIX_HIGH] = ",high", + [SUFFIX_LOW] = ",low", + [SUFFIX_NULL] = NULL, +}; + +/* + * That function parses "suffix" crashkernel command lines like + * + * crashkernel=size,[high|low] + * + * It returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_suffix(char *cmdline, + unsigned long long *crash_size, + const char *suffix) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + /* check with suffix */ + if (strncmp(cur, suffix, strlen(suffix))) { + pr_warn("crashkernel: unrecognized char\n"); + return -EINVAL; + } + cur += strlen(suffix); + if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char\n"); + return -EINVAL; + } + + return 0; +} + +static __init char *get_last_crashkernel(char *cmdline, + const char *name, + const char *suffix) +{ + char *p = cmdline, *ck_cmdline = NULL; + + /* find crashkernel and use the last one if there are more */ + p = strstr(p, name); + while (p) { + char *end_p = strchr(p, ' '); + char *q; + + if (!end_p) + end_p = p + strlen(p); + + if (!suffix) { + int i; + + /* skip the one with any known suffix */ + for (i = 0; suffix_tbl[i]; i++) { + q = end_p - strlen(suffix_tbl[i]); + if (!strncmp(q, suffix_tbl[i], + strlen(suffix_tbl[i]))) + goto next; + } + ck_cmdline = p; + } else { + q = end_p - strlen(suffix); + if (!strncmp(q, suffix, strlen(suffix))) + ck_cmdline = p; + } +next: + p = strstr(p+1, name); + } + + if (!ck_cmdline) + return NULL; + + return ck_cmdline; +} + +static int __init __parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base, + const char *name, + const char *suffix) +{ + char *first_colon, *first_space; + char *ck_cmdline; + + BUG_ON(!crash_size || !crash_base); + *crash_size = 0; + *crash_base = 0; + + ck_cmdline = get_last_crashkernel(cmdline, name, suffix); + + if (!ck_cmdline) + return -EINVAL; + + ck_cmdline += strlen(name); + + if (suffix) + return parse_crashkernel_suffix(ck_cmdline, crash_size, + suffix); + /* + * if the commandline contains a ':', then that's the extended + * syntax -- if not, it must be the classic syntax + */ + first_colon = strchr(ck_cmdline, ':'); + first_space = strchr(ck_cmdline, ' '); + if (first_colon && (!first_space || first_colon < first_space)) + return parse_crashkernel_mem(ck_cmdline, system_ram, + crash_size, crash_base); + + return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); +} + +/* + * That function is the entry point for command line parsing and should be + * called from the arch-specific code. + */ +int __init parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", NULL); +} + +int __init parse_crashkernel_high(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", suffix_tbl[SUFFIX_HIGH]); +} + +int __init parse_crashkernel_low(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", suffix_tbl[SUFFIX_LOW]); +} + +static void update_vmcoreinfo_note(void) +{ + u32 *buf = vmcoreinfo_note; + + if (!vmcoreinfo_size) + return; + buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, + vmcoreinfo_size); + final_note(buf); +} + +void crash_save_vmcoreinfo(void) +{ + vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); + update_vmcoreinfo_note(); +} + +void vmcoreinfo_append_str(const char *fmt, ...) +{ + va_list args; + char buf[0x50]; + size_t r; + + va_start(args, fmt); + r = vscnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); + + memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); + + vmcoreinfo_size += r; +} + +/* + * provide an empty default implementation here -- architecture + * code may override this + */ +void __weak arch_crash_save_vmcoreinfo(void) +{} + +unsigned long __weak paddr_vmcoreinfo_note(void) +{ + return __pa((unsigned long)(char *)&vmcoreinfo_note); +} + +static int __init crash_save_vmcoreinfo_init(void) +{ + VMCOREINFO_OSRELEASE(init_uts_ns.name.release); + VMCOREINFO_PAGESIZE(PAGE_SIZE); + + VMCOREINFO_SYMBOL(init_uts_ns); + VMCOREINFO_SYMBOL(node_online_map); +#ifdef CONFIG_MMU + VMCOREINFO_SYMBOL(swapper_pg_dir); +#endif + VMCOREINFO_SYMBOL(_stext); + VMCOREINFO_SYMBOL(vmap_area_list); + +#ifndef CONFIG_NEED_MULTIPLE_NODES + VMCOREINFO_SYMBOL(mem_map); + VMCOREINFO_SYMBOL(contig_page_data); +#endif +#ifdef CONFIG_SPARSEMEM + VMCOREINFO_SYMBOL(mem_section); + VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); + VMCOREINFO_STRUCT_SIZE(mem_section); + VMCOREINFO_OFFSET(mem_section, section_mem_map); +#endif + VMCOREINFO_STRUCT_SIZE(page); + VMCOREINFO_STRUCT_SIZE(pglist_data); + VMCOREINFO_STRUCT_SIZE(zone); + VMCOREINFO_STRUCT_SIZE(free_area); + VMCOREINFO_STRUCT_SIZE(list_head); + VMCOREINFO_SIZE(nodemask_t); + VMCOREINFO_OFFSET(page, flags); + VMCOREINFO_OFFSET(page, _count); + VMCOREINFO_OFFSET(page, mapping); + VMCOREINFO_OFFSET(page, lru); + VMCOREINFO_OFFSET(page, _mapcount); + VMCOREINFO_OFFSET(page, private); + VMCOREINFO_OFFSET(pglist_data, node_zones); + VMCOREINFO_OFFSET(pglist_data, nr_zones); +#ifdef CONFIG_FLAT_NODE_MEM_MAP + VMCOREINFO_OFFSET(pglist_data, node_mem_map); +#endif + VMCOREINFO_OFFSET(pglist_data, node_start_pfn); + VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); + VMCOREINFO_OFFSET(pglist_data, node_id); + VMCOREINFO_OFFSET(zone, free_area); + VMCOREINFO_OFFSET(zone, vm_stat); + VMCOREINFO_OFFSET(zone, spanned_pages); + VMCOREINFO_OFFSET(free_area, free_list); + VMCOREINFO_OFFSET(list_head, next); + VMCOREINFO_OFFSET(list_head, prev); + VMCOREINFO_OFFSET(vmap_area, va_start); + VMCOREINFO_OFFSET(vmap_area, list); + VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); + log_buf_kexec_setup(); + VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); + VMCOREINFO_NUMBER(NR_FREE_PAGES); + VMCOREINFO_NUMBER(PG_lru); + VMCOREINFO_NUMBER(PG_private); + VMCOREINFO_NUMBER(PG_swapcache); + VMCOREINFO_NUMBER(PG_slab); +#ifdef CONFIG_MEMORY_FAILURE + VMCOREINFO_NUMBER(PG_hwpoison); +#endif + VMCOREINFO_NUMBER(PG_head_mask); + VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); +#ifdef CONFIG_X86 + VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); +#endif +#ifdef CONFIG_HUGETLBFS + VMCOREINFO_SYMBOL(free_huge_page); +#endif + + arch_crash_save_vmcoreinfo(); + update_vmcoreinfo_note(); + + return 0; +} + +subsys_initcall(crash_save_vmcoreinfo_init); + +/* + * Move into place and start executing a preloaded standalone + * executable. If nothing was preloaded return an error. + */ +int kernel_kexec(void) +{ + int error = 0; + + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; + if (!kexec_image) { + error = -EINVAL; + goto Unlock; + } + +#ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) { + lock_system_sleep(); + pm_prepare_console(); + error = freeze_processes(); + if (error) { + error = -EBUSY; + goto Restore_console; + } + suspend_console(); + error = dpm_suspend_start(PMSG_FREEZE); + if (error) + goto Resume_console; + /* At this point, dpm_suspend_start() has been called, + * but *not* dpm_suspend_end(). We *must* call + * dpm_suspend_end() now. Otherwise, drivers for + * some devices (e.g. interrupt controllers) become + * desynchronized with the actual state of the + * hardware at resume time, and evil weirdness ensues. + */ + error = dpm_suspend_end(PMSG_FREEZE); + if (error) + goto Resume_devices; + error = disable_nonboot_cpus(); + if (error) + goto Enable_cpus; + local_irq_disable(); + error = syscore_suspend(); + if (error) + goto Enable_irqs; + } else +#endif + { + kexec_in_progress = true; + kernel_restart_prepare(NULL); + migrate_to_reboot_cpu(); + + /* + * migrate_to_reboot_cpu() disables CPU hotplug assuming that + * no further code needs to use CPU hotplug (which is true in + * the reboot case). However, the kexec path depends on using + * CPU hotplug again; so re-enable it here. + */ + cpu_hotplug_enable(); + pr_emerg("Starting new kernel\n"); + machine_shutdown(); + } + + machine_kexec(kexec_image); + +#ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) { + syscore_resume(); + Enable_irqs: + local_irq_enable(); + Enable_cpus: + enable_nonboot_cpus(); + dpm_resume_start(PMSG_RESTORE); + Resume_devices: + dpm_resume_end(PMSG_RESTORE); + Resume_console: + resume_console(); + thaw_processes(); + Restore_console: + pm_restore_console(); + unlock_system_sleep(); + } +#endif + + Unlock: + mutex_unlock(&kexec_mutex); + return error; +} + +/* + * Add and remove page tables for crashkernel memory + * + * Provide an empty default implementation here -- architecture + * code may override this + */ +void __weak crash_map_reserved_pages(void) +{} + +void __weak crash_unmap_reserved_pages(void) +{} diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c new file mode 100644 index 000000000000..6a9a3f2a0e8e --- /dev/null +++ b/kernel/kexec_file.c @@ -0,0 +1,1045 @@ +/* + * kexec: kexec_file_load system call + * + * Copyright (C) 2014 Red Hat Inc. + * Authors: + * Vivek Goyal <vgoyal@redhat.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include <linux/capability.h> +#include <linux/mm.h> +#include <linux/file.h> +#include <linux/slab.h> +#include <linux/kexec.h> +#include <linux/mutex.h> +#include <linux/list.h> +#include <crypto/hash.h> +#include <crypto/sha.h> +#include <linux/syscalls.h> +#include <linux/vmalloc.h> +#include "kexec_internal.h" + +/* + * Declare these symbols weak so that if architecture provides a purgatory, + * these will be overridden. + */ +char __weak kexec_purgatory[0]; +size_t __weak kexec_purgatory_size = 0; + +static int kexec_calculate_store_digests(struct kimage *image); + +static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) +{ + struct fd f = fdget(fd); + int ret; + struct kstat stat; + loff_t pos; + ssize_t bytes = 0; + + if (!f.file) + return -EBADF; + + ret = vfs_getattr(&f.file->f_path, &stat); + if (ret) + goto out; + + if (stat.size > INT_MAX) { + ret = -EFBIG; + goto out; + } + + /* Don't hand 0 to vmalloc, it whines. */ + if (stat.size == 0) { + ret = -EINVAL; + goto out; + } + + *buf = vmalloc(stat.size); + if (!*buf) { + ret = -ENOMEM; + goto out; + } + + pos = 0; + while (pos < stat.size) { + bytes = kernel_read(f.file, pos, (char *)(*buf) + pos, + stat.size - pos); + if (bytes < 0) { + vfree(*buf); + ret = bytes; + goto out; + } + + if (bytes == 0) + break; + pos += bytes; + } + + if (pos != stat.size) { + ret = -EBADF; + vfree(*buf); + goto out; + } + + *buf_len = pos; +out: + fdput(f); + return ret; +} + +/* Architectures can provide this probe function */ +int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, + unsigned long buf_len) +{ + return -ENOEXEC; +} + +void * __weak arch_kexec_kernel_image_load(struct kimage *image) +{ + return ERR_PTR(-ENOEXEC); +} + +int __weak arch_kimage_file_post_load_cleanup(struct kimage *image) +{ + return -EINVAL; +} + +int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, + unsigned long buf_len) +{ + return -EKEYREJECTED; +} + +/* Apply relocations of type RELA */ +int __weak +arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + unsigned int relsec) +{ + pr_err("RELA relocation unsupported.\n"); + return -ENOEXEC; +} + +/* Apply relocations of type REL */ +int __weak +arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + unsigned int relsec) +{ + pr_err("REL relocation unsupported.\n"); + return -ENOEXEC; +} + +/* + * Free up memory used by kernel, initrd, and command line. This is temporary + * memory allocation which is not needed any more after these buffers have + * been loaded into separate segments and have been copied elsewhere. + */ +void kimage_file_post_load_cleanup(struct kimage *image) +{ + struct purgatory_info *pi = &image->purgatory_info; + + vfree(image->kernel_buf); + image->kernel_buf = NULL; + + vfree(image->initrd_buf); + image->initrd_buf = NULL; + + kfree(image->cmdline_buf); + image->cmdline_buf = NULL; + + vfree(pi->purgatory_buf); + pi->purgatory_buf = NULL; + + vfree(pi->sechdrs); + pi->sechdrs = NULL; + + /* See if architecture has anything to cleanup post load */ + arch_kimage_file_post_load_cleanup(image); + + /* + * Above call should have called into bootloader to free up + * any data stored in kimage->image_loader_data. It should + * be ok now to free it up. + */ + kfree(image->image_loader_data); + image->image_loader_data = NULL; +} + +/* + * In file mode list of segments is prepared by kernel. Copy relevant + * data from user space, do error checking, prepare segment list + */ +static int +kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, + const char __user *cmdline_ptr, + unsigned long cmdline_len, unsigned flags) +{ + int ret = 0; + void *ldata; + + ret = copy_file_from_fd(kernel_fd, &image->kernel_buf, + &image->kernel_buf_len); + if (ret) + return ret; + + /* Call arch image probe handlers */ + ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, + image->kernel_buf_len); + + if (ret) + goto out; + +#ifdef CONFIG_KEXEC_VERIFY_SIG + ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, + image->kernel_buf_len); + if (ret) { + pr_debug("kernel signature verification failed.\n"); + goto out; + } + pr_debug("kernel signature verification successful.\n"); +#endif + /* It is possible that there no initramfs is being loaded */ + if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { + ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, + &image->initrd_buf_len); + if (ret) + goto out; + } + + if (cmdline_len) { + image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL); + if (!image->cmdline_buf) { + ret = -ENOMEM; + goto out; + } + + ret = copy_from_user(image->cmdline_buf, cmdline_ptr, + cmdline_len); + if (ret) { + ret = -EFAULT; + goto out; + } + + image->cmdline_buf_len = cmdline_len; + + /* command line should be a string with last byte null */ + if (image->cmdline_buf[cmdline_len - 1] != '\0') { + ret = -EINVAL; + goto out; + } + } + + /* Call arch image load handlers */ + ldata = arch_kexec_kernel_image_load(image); + + if (IS_ERR(ldata)) { + ret = PTR_ERR(ldata); + goto out; + } + + image->image_loader_data = ldata; +out: + /* In case of error, free up all allocated memory in this function */ + if (ret) + kimage_file_post_load_cleanup(image); + return ret; +} + +static int +kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, + int initrd_fd, const char __user *cmdline_ptr, + unsigned long cmdline_len, unsigned long flags) +{ + int ret; + struct kimage *image; + bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH; + + image = do_kimage_alloc_init(); + if (!image) + return -ENOMEM; + + image->file_mode = 1; + + if (kexec_on_panic) { + /* Enable special crash kernel control page alloc policy. */ + image->control_page = crashk_res.start; + image->type = KEXEC_TYPE_CRASH; + } + + ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, + cmdline_ptr, cmdline_len, flags); + if (ret) + goto out_free_image; + + ret = sanity_check_segment_list(image); + if (ret) + goto out_free_post_load_bufs; + + ret = -ENOMEM; + image->control_code_page = kimage_alloc_control_pages(image, + get_order(KEXEC_CONTROL_PAGE_SIZE)); + if (!image->control_code_page) { + pr_err("Could not allocate control_code_buffer\n"); + goto out_free_post_load_bufs; + } + + if (!kexec_on_panic) { + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + pr_err("Could not allocate swap buffer\n"); + goto out_free_control_pages; + } + } + + *rimage = image; + return 0; +out_free_control_pages: + kimage_free_page_list(&image->control_pages); +out_free_post_load_bufs: + kimage_file_post_load_cleanup(image); +out_free_image: + kfree(image); + return ret; +} + +SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, + unsigned long, cmdline_len, const char __user *, cmdline_ptr, + unsigned long, flags) +{ + int ret = 0, i; + struct kimage **dest_image, *image; + + /* We only trust the superuser with rebooting the system. */ + if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) + return -EPERM; + + /* Make sure we have a legal set of flags */ + if (flags != (flags & KEXEC_FILE_FLAGS)) + return -EINVAL; + + image = NULL; + + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; + + dest_image = &kexec_image; + if (flags & KEXEC_FILE_ON_CRASH) + dest_image = &kexec_crash_image; + + if (flags & KEXEC_FILE_UNLOAD) + goto exchange; + + /* + * In case of crash, new kernel gets loaded in reserved region. It is + * same memory where old crash kernel might be loaded. Free any + * current crash dump kernel before we corrupt it. + */ + if (flags & KEXEC_FILE_ON_CRASH) + kimage_free(xchg(&kexec_crash_image, NULL)); + + ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr, + cmdline_len, flags); + if (ret) + goto out; + + ret = machine_kexec_prepare(image); + if (ret) + goto out; + + ret = kexec_calculate_store_digests(image); + if (ret) + goto out; + + for (i = 0; i < image->nr_segments; i++) { + struct kexec_segment *ksegment; + + ksegment = &image->segment[i]; + pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n", + i, ksegment->buf, ksegment->bufsz, ksegment->mem, + ksegment->memsz); + + ret = kimage_load_segment(image, &image->segment[i]); + if (ret) + goto out; + } + + kimage_terminate(image); + + /* + * Free up any temporary buffers allocated which are not needed + * after image has been loaded + */ + kimage_file_post_load_cleanup(image); +exchange: + image = xchg(dest_image, image); +out: + mutex_unlock(&kexec_mutex); + kimage_free(image); + return ret; +} + +static int locate_mem_hole_top_down(unsigned long start, unsigned long end, + struct kexec_buf *kbuf) +{ + struct kimage *image = kbuf->image; + unsigned long temp_start, temp_end; + + temp_end = min(end, kbuf->buf_max); + temp_start = temp_end - kbuf->memsz; + + do { + /* align down start */ + temp_start = temp_start & (~(kbuf->buf_align - 1)); + + if (temp_start < start || temp_start < kbuf->buf_min) + return 0; + + temp_end = temp_start + kbuf->memsz - 1; + + /* + * Make sure this does not conflict with any of existing + * segments + */ + if (kimage_is_destination_range(image, temp_start, temp_end)) { + temp_start = temp_start - PAGE_SIZE; + continue; + } + + /* We found a suitable memory range */ + break; + } while (1); + + /* If we are here, we found a suitable memory range */ + kbuf->mem = temp_start; + + /* Success, stop navigating through remaining System RAM ranges */ + return 1; +} + +static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, + struct kexec_buf *kbuf) +{ + struct kimage *image = kbuf->image; + unsigned long temp_start, temp_end; + + temp_start = max(start, kbuf->buf_min); + + do { + temp_start = ALIGN(temp_start, kbuf->buf_align); + temp_end = temp_start + kbuf->memsz - 1; + + if (temp_end > end || temp_end > kbuf->buf_max) + return 0; + /* + * Make sure this does not conflict with any of existing + * segments + */ + if (kimage_is_destination_range(image, temp_start, temp_end)) { + temp_start = temp_start + PAGE_SIZE; + continue; + } + + /* We found a suitable memory range */ + break; + } while (1); + + /* If we are here, we found a suitable memory range */ + kbuf->mem = temp_start; + + /* Success, stop navigating through remaining System RAM ranges */ + return 1; +} + +static int locate_mem_hole_callback(u64 start, u64 end, void *arg) +{ + struct kexec_buf *kbuf = (struct kexec_buf *)arg; + unsigned long sz = end - start + 1; + + /* Returning 0 will take to next memory range */ + if (sz < kbuf->memsz) + return 0; + + if (end < kbuf->buf_min || start > kbuf->buf_max) + return 0; + + /* + * Allocate memory top down with-in ram range. Otherwise bottom up + * allocation. + */ + if (kbuf->top_down) + return locate_mem_hole_top_down(start, end, kbuf); + return locate_mem_hole_bottom_up(start, end, kbuf); +} + +/* + * Helper function for placing a buffer in a kexec segment. This assumes + * that kexec_mutex is held. + */ +int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, + unsigned long memsz, unsigned long buf_align, + unsigned long buf_min, unsigned long buf_max, + bool top_down, unsigned long *load_addr) +{ + + struct kexec_segment *ksegment; + struct kexec_buf buf, *kbuf; + int ret; + + /* Currently adding segment this way is allowed only in file mode */ + if (!image->file_mode) + return -EINVAL; + + if (image->nr_segments >= KEXEC_SEGMENT_MAX) + return -EINVAL; + + /* + * Make sure we are not trying to add buffer after allocating + * control pages. All segments need to be placed first before + * any control pages are allocated. As control page allocation + * logic goes through list of segments to make sure there are + * no destination overlaps. + */ + if (!list_empty(&image->control_pages)) { + WARN_ON(1); + return -EINVAL; + } + + memset(&buf, 0, sizeof(struct kexec_buf)); + kbuf = &buf; + kbuf->image = image; + kbuf->buffer = buffer; + kbuf->bufsz = bufsz; + + kbuf->memsz = ALIGN(memsz, PAGE_SIZE); + kbuf->buf_align = max(buf_align, PAGE_SIZE); + kbuf->buf_min = buf_min; + kbuf->buf_max = buf_max; + kbuf->top_down = top_down; + + /* Walk the RAM ranges and allocate a suitable range for the buffer */ + if (image->type == KEXEC_TYPE_CRASH) + ret = walk_iomem_res("Crash kernel", + IORESOURCE_MEM | IORESOURCE_BUSY, + crashk_res.start, crashk_res.end, kbuf, + locate_mem_hole_callback); + else + ret = walk_system_ram_res(0, -1, kbuf, + locate_mem_hole_callback); + if (ret != 1) { + /* A suitable memory range could not be found for buffer */ + return -EADDRNOTAVAIL; + } + + /* Found a suitable memory range */ + ksegment = &image->segment[image->nr_segments]; + ksegment->kbuf = kbuf->buffer; + ksegment->bufsz = kbuf->bufsz; + ksegment->mem = kbuf->mem; + ksegment->memsz = kbuf->memsz; + image->nr_segments++; + *load_addr = ksegment->mem; + return 0; +} + +/* Calculate and store the digest of segments */ +static int kexec_calculate_store_digests(struct kimage *image) +{ + struct crypto_shash *tfm; + struct shash_desc *desc; + int ret = 0, i, j, zero_buf_sz, sha_region_sz; + size_t desc_size, nullsz; + char *digest; + void *zero_buf; + struct kexec_sha_region *sha_regions; + struct purgatory_info *pi = &image->purgatory_info; + + zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); + zero_buf_sz = PAGE_SIZE; + + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + ret = PTR_ERR(tfm); + goto out; + } + + desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); + desc = kzalloc(desc_size, GFP_KERNEL); + if (!desc) { + ret = -ENOMEM; + goto out_free_tfm; + } + + sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region); + sha_regions = vzalloc(sha_region_sz); + if (!sha_regions) + goto out_free_desc; + + desc->tfm = tfm; + desc->flags = 0; + + ret = crypto_shash_init(desc); + if (ret < 0) + goto out_free_sha_regions; + + digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); + if (!digest) { + ret = -ENOMEM; + goto out_free_sha_regions; + } + + for (j = i = 0; i < image->nr_segments; i++) { + struct kexec_segment *ksegment; + + ksegment = &image->segment[i]; + /* + * Skip purgatory as it will be modified once we put digest + * info in purgatory. + */ + if (ksegment->kbuf == pi->purgatory_buf) + continue; + + ret = crypto_shash_update(desc, ksegment->kbuf, + ksegment->bufsz); + if (ret) + break; + + /* + * Assume rest of the buffer is filled with zero and + * update digest accordingly. + */ + nullsz = ksegment->memsz - ksegment->bufsz; + while (nullsz) { + unsigned long bytes = nullsz; + + if (bytes > zero_buf_sz) + bytes = zero_buf_sz; + ret = crypto_shash_update(desc, zero_buf, bytes); + if (ret) + break; + nullsz -= bytes; + } + + if (ret) + break; + + sha_regions[j].start = ksegment->mem; + sha_regions[j].len = ksegment->memsz; + j++; + } + + if (!ret) { + ret = crypto_shash_final(desc, digest); + if (ret) + goto out_free_digest; + ret = kexec_purgatory_get_set_symbol(image, "sha_regions", + sha_regions, sha_region_sz, 0); + if (ret) + goto out_free_digest; + + ret = kexec_purgatory_get_set_symbol(image, "sha256_digest", + digest, SHA256_DIGEST_SIZE, 0); + if (ret) + goto out_free_digest; + } + +out_free_digest: + kfree(digest); +out_free_sha_regions: + vfree(sha_regions); +out_free_desc: + kfree(desc); +out_free_tfm: + kfree(tfm); +out: + return ret; +} + +/* Actually load purgatory. Lot of code taken from kexec-tools */ +static int __kexec_load_purgatory(struct kimage *image, unsigned long min, + unsigned long max, int top_down) +{ + struct purgatory_info *pi = &image->purgatory_info; + unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad; + unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset; + unsigned char *buf_addr, *src; + int i, ret = 0, entry_sidx = -1; + const Elf_Shdr *sechdrs_c; + Elf_Shdr *sechdrs = NULL; + void *purgatory_buf = NULL; + + /* + * sechdrs_c points to section headers in purgatory and are read + * only. No modifications allowed. + */ + sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff; + + /* + * We can not modify sechdrs_c[] and its fields. It is read only. + * Copy it over to a local copy where one can store some temporary + * data and free it at the end. We need to modify ->sh_addr and + * ->sh_offset fields to keep track of permanent and temporary + * locations of sections. + */ + sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + if (!sechdrs) + return -ENOMEM; + + memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + + /* + * We seem to have multiple copies of sections. First copy is which + * is embedded in kernel in read only section. Some of these sections + * will be copied to a temporary buffer and relocated. And these + * sections will finally be copied to their final destination at + * segment load time. + * + * Use ->sh_offset to reflect section address in memory. It will + * point to original read only copy if section is not allocatable. + * Otherwise it will point to temporary copy which will be relocated. + * + * Use ->sh_addr to contain final address of the section where it + * will go during execution time. + */ + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (sechdrs[i].sh_type == SHT_NOBITS) + continue; + + sechdrs[i].sh_offset = (unsigned long)pi->ehdr + + sechdrs[i].sh_offset; + } + + /* + * Identify entry point section and make entry relative to section + * start. + */ + entry = pi->ehdr->e_entry; + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + if (!(sechdrs[i].sh_flags & SHF_EXECINSTR)) + continue; + + /* Make entry section relative */ + if (sechdrs[i].sh_addr <= pi->ehdr->e_entry && + ((sechdrs[i].sh_addr + sechdrs[i].sh_size) > + pi->ehdr->e_entry)) { + entry_sidx = i; + entry -= sechdrs[i].sh_addr; + break; + } + } + + /* Determine how much memory is needed to load relocatable object. */ + buf_align = 1; + bss_align = 1; + buf_sz = 0; + bss_sz = 0; + + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + align = sechdrs[i].sh_addralign; + if (sechdrs[i].sh_type != SHT_NOBITS) { + if (buf_align < align) + buf_align = align; + buf_sz = ALIGN(buf_sz, align); + buf_sz += sechdrs[i].sh_size; + } else { + /* bss section */ + if (bss_align < align) + bss_align = align; + bss_sz = ALIGN(bss_sz, align); + bss_sz += sechdrs[i].sh_size; + } + } + + /* Determine the bss padding required to align bss properly */ + bss_pad = 0; + if (buf_sz & (bss_align - 1)) + bss_pad = bss_align - (buf_sz & (bss_align - 1)); + + memsz = buf_sz + bss_pad + bss_sz; + + /* Allocate buffer for purgatory */ + purgatory_buf = vzalloc(buf_sz); + if (!purgatory_buf) { + ret = -ENOMEM; + goto out; + } + + if (buf_align < bss_align) + buf_align = bss_align; + + /* Add buffer to segment list */ + ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz, + buf_align, min, max, top_down, + &pi->purgatory_load_addr); + if (ret) + goto out; + + /* Load SHF_ALLOC sections */ + buf_addr = purgatory_buf; + load_addr = curr_load_addr = pi->purgatory_load_addr; + bss_addr = load_addr + buf_sz + bss_pad; + + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + align = sechdrs[i].sh_addralign; + if (sechdrs[i].sh_type != SHT_NOBITS) { + curr_load_addr = ALIGN(curr_load_addr, align); + offset = curr_load_addr - load_addr; + /* We already modifed ->sh_offset to keep src addr */ + src = (char *) sechdrs[i].sh_offset; + memcpy(buf_addr + offset, src, sechdrs[i].sh_size); + + /* Store load address and source address of section */ + sechdrs[i].sh_addr = curr_load_addr; + + /* + * This section got copied to temporary buffer. Update + * ->sh_offset accordingly. + */ + sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); + + /* Advance to the next address */ + curr_load_addr += sechdrs[i].sh_size; + } else { + bss_addr = ALIGN(bss_addr, align); + sechdrs[i].sh_addr = bss_addr; + bss_addr += sechdrs[i].sh_size; + } + } + + /* Update entry point based on load address of text section */ + if (entry_sidx >= 0) + entry += sechdrs[entry_sidx].sh_addr; + + /* Make kernel jump to purgatory after shutdown */ + image->start = entry; + + /* Used later to get/set symbol values */ + pi->sechdrs = sechdrs; + + /* + * Used later to identify which section is purgatory and skip it + * from checksumming. + */ + pi->purgatory_buf = purgatory_buf; + return ret; +out: + vfree(sechdrs); + vfree(purgatory_buf); + return ret; +} + +static int kexec_apply_relocations(struct kimage *image) +{ + int i, ret; + struct purgatory_info *pi = &image->purgatory_info; + Elf_Shdr *sechdrs = pi->sechdrs; + + /* Apply relocations */ + for (i = 0; i < pi->ehdr->e_shnum; i++) { + Elf_Shdr *section, *symtab; + + if (sechdrs[i].sh_type != SHT_RELA && + sechdrs[i].sh_type != SHT_REL) + continue; + + /* + * For section of type SHT_RELA/SHT_REL, + * ->sh_link contains section header index of associated + * symbol table. And ->sh_info contains section header + * index of section to which relocations apply. + */ + if (sechdrs[i].sh_info >= pi->ehdr->e_shnum || + sechdrs[i].sh_link >= pi->ehdr->e_shnum) + return -ENOEXEC; + + section = &sechdrs[sechdrs[i].sh_info]; + symtab = &sechdrs[sechdrs[i].sh_link]; + + if (!(section->sh_flags & SHF_ALLOC)) + continue; + + /* + * symtab->sh_link contain section header index of associated + * string table. + */ + if (symtab->sh_link >= pi->ehdr->e_shnum) + /* Invalid section number? */ + continue; + + /* + * Respective architecture needs to provide support for applying + * relocations of type SHT_RELA/SHT_REL. + */ + if (sechdrs[i].sh_type == SHT_RELA) + ret = arch_kexec_apply_relocations_add(pi->ehdr, + sechdrs, i); + else if (sechdrs[i].sh_type == SHT_REL) + ret = arch_kexec_apply_relocations(pi->ehdr, + sechdrs, i); + if (ret) + return ret; + } + + return 0; +} + +/* Load relocatable purgatory object and relocate it appropriately */ +int kexec_load_purgatory(struct kimage *image, unsigned long min, + unsigned long max, int top_down, + unsigned long *load_addr) +{ + struct purgatory_info *pi = &image->purgatory_info; + int ret; + + if (kexec_purgatory_size <= 0) + return -EINVAL; + + if (kexec_purgatory_size < sizeof(Elf_Ehdr)) + return -ENOEXEC; + + pi->ehdr = (Elf_Ehdr *)kexec_purgatory; + + if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0 + || pi->ehdr->e_type != ET_REL + || !elf_check_arch(pi->ehdr) + || pi->ehdr->e_shentsize != sizeof(Elf_Shdr)) + return -ENOEXEC; + + if (pi->ehdr->e_shoff >= kexec_purgatory_size + || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) > + kexec_purgatory_size - pi->ehdr->e_shoff)) + return -ENOEXEC; + + ret = __kexec_load_purgatory(image, min, max, top_down); + if (ret) + return ret; + + ret = kexec_apply_relocations(image); + if (ret) + goto out; + + *load_addr = pi->purgatory_load_addr; + return 0; +out: + vfree(pi->sechdrs); + vfree(pi->purgatory_buf); + return ret; +} + +static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, + const char *name) +{ + Elf_Sym *syms; + Elf_Shdr *sechdrs; + Elf_Ehdr *ehdr; + int i, k; + const char *strtab; + + if (!pi->sechdrs || !pi->ehdr) + return NULL; + + sechdrs = pi->sechdrs; + ehdr = pi->ehdr; + + for (i = 0; i < ehdr->e_shnum; i++) { + if (sechdrs[i].sh_type != SHT_SYMTAB) + continue; + + if (sechdrs[i].sh_link >= ehdr->e_shnum) + /* Invalid strtab section number */ + continue; + strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset; + syms = (Elf_Sym *)sechdrs[i].sh_offset; + + /* Go through symbols for a match */ + for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { + if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL) + continue; + + if (strcmp(strtab + syms[k].st_name, name) != 0) + continue; + + if (syms[k].st_shndx == SHN_UNDEF || + syms[k].st_shndx >= ehdr->e_shnum) { + pr_debug("Symbol: %s has bad section index %d.\n", + name, syms[k].st_shndx); + return NULL; + } + + /* Found the symbol we are looking for */ + return &syms[k]; + } + } + + return NULL; +} + +void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) +{ + struct purgatory_info *pi = &image->purgatory_info; + Elf_Sym *sym; + Elf_Shdr *sechdr; + + sym = kexec_purgatory_find_symbol(pi, name); + if (!sym) + return ERR_PTR(-EINVAL); + + sechdr = &pi->sechdrs[sym->st_shndx]; + + /* + * Returns the address where symbol will finally be loaded after + * kexec_load_segment() + */ + return (void *)(sechdr->sh_addr + sym->st_value); +} + +/* + * Get or set value of a symbol. If "get_value" is true, symbol value is + * returned in buf otherwise symbol value is set based on value in buf. + */ +int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, + void *buf, unsigned int size, bool get_value) +{ + Elf_Sym *sym; + Elf_Shdr *sechdrs; + struct purgatory_info *pi = &image->purgatory_info; + char *sym_buf; + + sym = kexec_purgatory_find_symbol(pi, name); + if (!sym) + return -EINVAL; + + if (sym->st_size != size) { + pr_err("symbol %s size mismatch: expected %lu actual %u\n", + name, (unsigned long)sym->st_size, size); + return -EINVAL; + } + + sechdrs = pi->sechdrs; + + if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { + pr_err("symbol %s is in a bss section. Cannot %s\n", name, + get_value ? "get" : "set"); + return -EINVAL; + } + + sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset + + sym->st_value; + + if (get_value) + memcpy((void *)buf, sym_buf, size); + else + memcpy((void *)sym_buf, buf, size); + + return 0; +} diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h new file mode 100644 index 000000000000..e4392a698ad4 --- /dev/null +++ b/kernel/kexec_internal.h @@ -0,0 +1,22 @@ +#ifndef LINUX_KEXEC_INTERNAL_H +#define LINUX_KEXEC_INTERNAL_H + +#include <linux/kexec.h> + +struct kimage *do_kimage_alloc_init(void); +int sanity_check_segment_list(struct kimage *image); +void kimage_free_page_list(struct list_head *list); +void kimage_free(struct kimage *image); +int kimage_load_segment(struct kimage *image, struct kexec_segment *segment); +void kimage_terminate(struct kimage *image); +int kimage_is_destination_range(struct kimage *image, + unsigned long start, unsigned long end); + +extern struct mutex kexec_mutex; + +#ifdef CONFIG_KEXEC_FILE +void kimage_file_post_load_cleanup(struct kimage *image); +#else /* CONFIG_KEXEC_FILE */ +static inline void kimage_file_post_load_cleanup(struct kimage *image) { } +#endif /* CONFIG_KEXEC_FILE */ +#endif /* LINUX_KEXEC_INTERNAL_H */ diff --git a/kernel/kmod.c b/kernel/kmod.c index 2777f40a9c7b..da98d0593de2 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -45,8 +45,6 @@ extern int max_threads; -static struct workqueue_struct *khelper_wq; - #define CAP_BSET (void *)1 #define CAP_PI (void *)2 @@ -114,10 +112,11 @@ out: * @...: arguments as specified in the format string * * Load a module using the user mode module loader. The function returns - * zero on success or a negative errno code on failure. Note that a - * successful module load does not mean the module did not then unload - * and exit on an error of its own. Callers must check that the service - * they requested is now available not blindly invoke it. + * zero on success or a negative errno code or positive exit code from + * "modprobe" on failure. Note that a successful module load does not mean + * the module did not then unload and exit on an error of its own. Callers + * must check that the service they requested is now available not blindly + * invoke it. * * If module auto-loading support is disabled then this function * becomes a no-operation. @@ -213,7 +212,7 @@ static void umh_complete(struct subprocess_info *sub_info) /* * This is the task which runs the usermode application */ -static int ____call_usermodehelper(void *data) +static int call_usermodehelper_exec_async(void *data) { struct subprocess_info *sub_info = data; struct cred *new; @@ -223,12 +222,9 @@ static int ____call_usermodehelper(void *data) flush_signal_handlers(current, 1); spin_unlock_irq(¤t->sighand->siglock); - /* We can run anywhere, unlike our parent keventd(). */ - set_cpus_allowed_ptr(current, cpu_all_mask); - /* - * Our parent is keventd, which runs with elevated scheduling priority. - * Avoid propagating that into the userspace child. + * Our parent (unbound workqueue) runs with elevated scheduling + * priority. Avoid propagating that into the userspace child. */ set_user_nice(current, 0); @@ -258,7 +254,10 @@ static int ____call_usermodehelper(void *data) (const char __user *const __user *)sub_info->envp); out: sub_info->retval = retval; - /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */ + /* + * call_usermodehelper_exec_sync() will call umh_complete + * if UHM_WAIT_PROC. + */ if (!(sub_info->wait & UMH_WAIT_PROC)) umh_complete(sub_info); if (!retval) @@ -266,15 +265,14 @@ out: do_exit(0); } -/* Keventd can't block, but this (a child) can. */ -static int wait_for_helper(void *data) +/* Handles UMH_WAIT_PROC. */ +static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) { - struct subprocess_info *sub_info = data; pid_t pid; /* If SIGCLD is ignored sys_wait4 won't populate the status. */ kernel_sigaction(SIGCHLD, SIG_DFL); - pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); + pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); if (pid < 0) { sub_info->retval = pid; } else { @@ -282,44 +280,60 @@ static int wait_for_helper(void *data) /* * Normally it is bogus to call wait4() from in-kernel because * wait4() wants to write the exit code to a userspace address. - * But wait_for_helper() always runs as keventd, and put_user() - * to a kernel address works OK for kernel threads, due to their - * having an mm_segment_t which spans the entire address space. + * But call_usermodehelper_exec_sync() always runs as kernel + * thread (workqueue) and put_user() to a kernel address works + * OK for kernel threads, due to their having an mm_segment_t + * which spans the entire address space. * * Thus the __user pointer cast is valid here. */ sys_wait4(pid, (int __user *)&ret, 0, NULL); /* - * If ret is 0, either ____call_usermodehelper failed and the - * real error code is already in sub_info->retval or + * If ret is 0, either call_usermodehelper_exec_async failed and + * the real error code is already in sub_info->retval or * sub_info->retval is 0 anyway, so don't mess with it then. */ if (ret) sub_info->retval = ret; } + /* Restore default kernel sig handler */ + kernel_sigaction(SIGCHLD, SIG_IGN); + umh_complete(sub_info); - do_exit(0); } -/* This is run by khelper thread */ -static void __call_usermodehelper(struct work_struct *work) +/* + * We need to create the usermodehelper kernel thread from a task that is affine + * to an optimized set of CPUs (or nohz housekeeping ones) such that they + * inherit a widest affinity irrespective of call_usermodehelper() callers with + * possibly reduced affinity (eg: per-cpu workqueues). We don't want + * usermodehelper targets to contend a busy CPU. + * + * Unbound workqueues provide such wide affinity and allow to block on + * UMH_WAIT_PROC requests without blocking pending request (up to some limit). + * + * Besides, workqueues provide the privilege level that caller might not have + * to perform the usermodehelper request. + * + */ +static void call_usermodehelper_exec_work(struct work_struct *work) { struct subprocess_info *sub_info = container_of(work, struct subprocess_info, work); - pid_t pid; - if (sub_info->wait & UMH_WAIT_PROC) - pid = kernel_thread(wait_for_helper, sub_info, - CLONE_FS | CLONE_FILES | SIGCHLD); - else - pid = kernel_thread(____call_usermodehelper, sub_info, - SIGCHLD); + if (sub_info->wait & UMH_WAIT_PROC) { + call_usermodehelper_exec_sync(sub_info); + } else { + pid_t pid; - if (pid < 0) { - sub_info->retval = pid; - umh_complete(sub_info); + pid = kernel_thread(call_usermodehelper_exec_async, sub_info, + SIGCHLD); + if (pid < 0) { + sub_info->retval = pid; + umh_complete(sub_info); + } } } @@ -509,7 +523,7 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, if (!sub_info) goto out; - INIT_WORK(&sub_info->work, __call_usermodehelper); + INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); sub_info->path = path; sub_info->argv = argv; sub_info->envp = envp; @@ -531,8 +545,8 @@ EXPORT_SYMBOL(call_usermodehelper_setup); * from interrupt context. * * Runs a user-space application. The application is started - * asynchronously if wait is not set, and runs as a child of keventd. - * (ie. it runs with full root capabilities). + * asynchronously if wait is not set, and runs as a child of system workqueues. + * (ie. it runs with full root capabilities and optimized affinity). */ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) { @@ -544,7 +558,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) return -EINVAL; } helper_lock(); - if (!khelper_wq || usermodehelper_disabled) { + if (usermodehelper_disabled) { retval = -EBUSY; goto out; } @@ -556,7 +570,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done; sub_info->wait = wait; - queue_work(khelper_wq, &sub_info->work); + queue_work(system_unbound_wq, &sub_info->work); if (wait == UMH_NO_WAIT) /* task has freed sub_info */ goto unlock; @@ -686,9 +700,3 @@ struct ctl_table usermodehelper_table[] = { }, { } }; - -void __init usermodehelper_init(void) -{ - khelper_wq = create_singlethread_workqueue("khelper"); - BUG_ON(!khelper_wq); -} diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 6683ccef9fff..e83b26464061 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -90,7 +90,7 @@ static ssize_t profiling_store(struct kobject *kobj, KERNEL_ATTR_RW(profiling); #endif -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE static ssize_t kexec_loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, } KERNEL_ATTR_RO(vmcoreinfo); -#endif /* CONFIG_KEXEC */ +#endif /* CONFIG_KEXEC_CORE */ /* whether file capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, @@ -196,7 +196,7 @@ static struct attribute * kernel_attrs[] = { #ifdef CONFIG_PROFILING &profiling_attr.attr, #endif -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE &kexec_loaded_attr.attr, &kexec_crash_loaded_attr.attr, &kexec_crash_size_attr.attr, diff --git a/kernel/kthread.c b/kernel/kthread.c index 490924cc9e7c..9ff173dca1ae 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -248,15 +248,16 @@ static void create_kthread(struct kthread_create_info *create) * kthread_create_on_node - create a kthread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. - * @node: memory node number. + * @node: task and thread structures for the thread are allocated on this node * @namefmt: printf-style name for the thread. * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start - * it. See also kthread_run(). + * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and + * is affine to all CPUs. * * If thread is going to be bound on a particular cpu, give its node - * in @node, to get NUMA affinity for kthread stack, or else give -1. + * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either call do_exit() directly if it is a * standalone thread for which no one will call kthread_stop(), or diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 7dd5c9918e4c..8e96f6cc2a4a 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += mutex.o semaphore.o rwsem.o +obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) @@ -20,11 +20,9 @@ obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o -obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o -obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 652a8ee8efe9..f32567254867 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -88,6 +88,19 @@ void percpu_down_read(struct percpu_rw_semaphore *brw) __up_read(&brw->rw_sem); } +int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) +{ + if (unlikely(!update_fast_ctr(brw, +1))) { + if (!__down_read_trylock(&brw->rw_sem)) + return 0; + atomic_inc(&brw->slow_read_ctr); + __up_read(&brw->rw_sem); + } + + rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_); + return 1; +} + void percpu_up_read(struct percpu_rw_semaphore *brw) { rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 6c5da483966b..f17a3e3b3550 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -55,27 +55,29 @@ rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) { while ((cnts & _QW_WMASK) == _QW_LOCKED) { cpu_relax_lowlatency(); - cnts = smp_load_acquire((u32 *)&lock->cnts); + cnts = atomic_read_acquire(&lock->cnts); } } /** - * queue_read_lock_slowpath - acquire read lock of a queue rwlock + * queued_read_lock_slowpath - acquire read lock of a queue rwlock * @lock: Pointer to queue rwlock structure + * @cnts: Current qrwlock lock value */ -void queue_read_lock_slowpath(struct qrwlock *lock) +void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) { - u32 cnts; - /* * Readers come here when they cannot get the lock without waiting */ if (unlikely(in_interrupt())) { /* - * Readers in interrupt context will spin until the lock is - * available without waiting in the queue. + * Readers in interrupt context will get the lock immediately + * if the writer is just waiting (not holding the lock yet). + * The rspin_until_writer_unlock() function returns immediately + * in this case. Otherwise, they will spin (with ACQUIRE + * semantics) until the lock is available without waiting in + * the queue. */ - cnts = smp_load_acquire((u32 *)&lock->cnts); rspin_until_writer_unlock(lock, cnts); return; } @@ -87,16 +89,11 @@ void queue_read_lock_slowpath(struct qrwlock *lock) arch_spin_lock(&lock->lock); /* - * At the head of the wait queue now, wait until the writer state - * goes to 0 and then try to increment the reader count and get - * the lock. It is possible that an incoming writer may steal the - * lock in the interim, so it is necessary to check the writer byte - * to make sure that the write lock isn't taken. + * The ACQUIRE semantics of the following spinning code ensure + * that accesses can't leak upwards out of our subsequent critical + * section in the case that the lock is currently held for write. */ - while (atomic_read(&lock->cnts) & _QW_WMASK) - cpu_relax_lowlatency(); - - cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS; + cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS; rspin_until_writer_unlock(lock, cnts); /* @@ -104,13 +101,13 @@ void queue_read_lock_slowpath(struct qrwlock *lock) */ arch_spin_unlock(&lock->lock); } -EXPORT_SYMBOL(queue_read_lock_slowpath); +EXPORT_SYMBOL(queued_read_lock_slowpath); /** - * queue_write_lock_slowpath - acquire write lock of a queue rwlock + * queued_write_lock_slowpath - acquire write lock of a queue rwlock * @lock : Pointer to queue rwlock structure */ -void queue_write_lock_slowpath(struct qrwlock *lock) +void queued_write_lock_slowpath(struct qrwlock *lock) { u32 cnts; @@ -119,7 +116,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock) /* Try to acquire the lock directly if no reader is present */ if (!atomic_read(&lock->cnts) && - (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0)) + (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0)) goto unlock; /* @@ -130,7 +127,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock) struct __qrwlock *l = (struct __qrwlock *)lock; if (!READ_ONCE(l->wmode) && - (cmpxchg(&l->wmode, 0, _QW_WAITING) == 0)) + (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0)) break; cpu_relax_lowlatency(); @@ -140,8 +137,8 @@ void queue_write_lock_slowpath(struct qrwlock *lock) for (;;) { cnts = atomic_read(&lock->cnts); if ((cnts == _QW_WAITING) && - (atomic_cmpxchg(&lock->cnts, _QW_WAITING, - _QW_LOCKED) == _QW_WAITING)) + (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING, + _QW_LOCKED) == _QW_WAITING)) break; cpu_relax_lowlatency(); @@ -149,4 +146,4 @@ void queue_write_lock_slowpath(struct qrwlock *lock) unlock: arch_spin_unlock(&lock->lock); } -EXPORT_SYMBOL(queue_write_lock_slowpath); +EXPORT_SYMBOL(queued_write_lock_slowpath); diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 38c49202d532..337c8818541d 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -239,8 +239,8 @@ static __always_inline void set_locked(struct qspinlock *lock) static __always_inline void __pv_init_node(struct mcs_spinlock *node) { } static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { } -static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { } - +static __always_inline void __pv_kick_node(struct qspinlock *lock, + struct mcs_spinlock *node) { } static __always_inline void __pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) { } @@ -440,7 +440,7 @@ queue: cpu_relax(); arch_mcs_spin_unlock_contended(&next->locked); - pv_kick_node(next); + pv_kick_node(lock, next); release: /* diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index df19ae4debd0..c8e6e9a596f5 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -22,9 +22,14 @@ #define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET) +/* + * Queue node uses: vcpu_running & vcpu_halted. + * Queue head uses: vcpu_running & vcpu_hashed. + */ enum vcpu_state { vcpu_running = 0, - vcpu_halted, + vcpu_halted, /* Used only in pv_wait_node */ + vcpu_hashed, /* = pv_hash'ed + vcpu_halted */ }; struct pv_node { @@ -153,7 +158,8 @@ static void pv_init_node(struct mcs_spinlock *node) /* * Wait for node->locked to become true, halt the vcpu after a short spin. - * pv_kick_node() is used to wake the vcpu again. + * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its + * behalf. */ static void pv_wait_node(struct mcs_spinlock *node) { @@ -172,9 +178,9 @@ static void pv_wait_node(struct mcs_spinlock *node) * * [S] pn->state = vcpu_halted [S] next->locked = 1 * MB MB - * [L] pn->locked [RmW] pn->state = vcpu_running + * [L] pn->locked [RmW] pn->state = vcpu_hashed * - * Matches the xchg() from pv_kick_node(). + * Matches the cmpxchg() from pv_kick_node(). */ smp_store_mb(pn->state, vcpu_halted); @@ -182,9 +188,10 @@ static void pv_wait_node(struct mcs_spinlock *node) pv_wait(&pn->state, vcpu_halted); /* - * Reset the vCPU state to avoid unncessary CPU kicking + * If pv_kick_node() changed us to vcpu_hashed, retain that value + * so that pv_wait_head() knows to not also try to hash this lock. */ - WRITE_ONCE(pn->state, vcpu_running); + cmpxchg(&pn->state, vcpu_halted, vcpu_running); /* * If the locked flag is still not set after wakeup, it is a @@ -194,6 +201,7 @@ static void pv_wait_node(struct mcs_spinlock *node) * MCS lock will be released soon. */ } + /* * By now our node->locked should be 1 and our caller will not actually * spin-wait for it. We do however rely on our caller to do a @@ -202,24 +210,35 @@ static void pv_wait_node(struct mcs_spinlock *node) } /* - * Called after setting next->locked = 1, used to wake those stuck in - * pv_wait_node(). + * Called after setting next->locked = 1 when we're the lock owner. + * + * Instead of waking the waiters stuck in pv_wait_node() advance their state such + * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle. */ -static void pv_kick_node(struct mcs_spinlock *node) +static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; + struct __qspinlock *l = (void *)lock; /* - * Note that because node->locked is already set, this actual - * mcs_spinlock entry could be re-used already. + * If the vCPU is indeed halted, advance its state to match that of + * pv_wait_node(). If OTOH this fails, the vCPU was running and will + * observe its next->locked value and advance itself. * - * This should be fine however, kicking people for no reason is - * harmless. + * Matches with smp_store_mb() and cmpxchg() in pv_wait_node() + */ + if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted) + return; + + /* + * Put the lock into the hash table and set the _Q_SLOW_VAL. * - * See the comment in pv_wait_node(). + * As this is the same vCPU that will check the _Q_SLOW_VAL value and + * the hash table later on at unlock time, no atomic instruction is + * needed. */ - if (xchg(&pn->state, vcpu_running) == vcpu_halted) - pv_kick(pn->cpu); + WRITE_ONCE(l->locked, _Q_SLOW_VAL); + (void)pv_hash(lock, pn); } /* @@ -233,6 +252,13 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) struct qspinlock **lp = NULL; int loop; + /* + * If pv_kick_node() already advanced our state, we don't need to + * insert ourselves into the hash table anymore. + */ + if (READ_ONCE(pn->state) == vcpu_hashed) + lp = (struct qspinlock **)1; + for (;;) { for (loop = SPIN_THRESHOLD; loop; loop--) { if (!READ_ONCE(l->locked)) @@ -240,17 +266,22 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) cpu_relax(); } - WRITE_ONCE(pn->state, vcpu_halted); if (!lp) { /* ONCE */ + WRITE_ONCE(pn->state, vcpu_hashed); lp = pv_hash(lock, pn); + /* - * lp must be set before setting _Q_SLOW_VAL + * We must hash before setting _Q_SLOW_VAL, such that + * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock() + * we'll be sure to be able to observe our hash entry. * - * [S] lp = lock [RmW] l = l->locked = 0 - * MB MB - * [S] l->locked = _Q_SLOW_VAL [L] lp + * [S] pn->state + * [S] <hash> [Rmw] l->locked == _Q_SLOW_VAL + * MB RMB + * [RmW] l->locked = _Q_SLOW_VAL [L] <unhash> + * [L] pn->state * - * Matches the cmpxchg() in __pv_queued_spin_unlock(). + * Matches the smp_rmb() in __pv_queued_spin_unlock(). */ if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) { /* @@ -287,24 +318,34 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) { struct __qspinlock *l = (void *)lock; struct pv_node *node; - u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); + u8 locked; /* * We must not unlock if SLOW, because in that case we must first * unhash. Otherwise it would be possible to have multiple @lock * entries, which would be BAD. */ - if (likely(lockval == _Q_LOCKED_VAL)) + locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); + if (likely(locked == _Q_LOCKED_VAL)) return; - if (unlikely(lockval != _Q_SLOW_VAL)) { - if (debug_locks_silent) - return; - WARN(1, "pvqspinlock: lock %p has corrupted value 0x%x!\n", lock, atomic_read(&lock->val)); + if (unlikely(locked != _Q_SLOW_VAL)) { + WARN(!debug_locks_silent, + "pvqspinlock: lock 0x%lx has corrupted value 0x%x!\n", + (unsigned long)lock, atomic_read(&lock->val)); return; } /* + * A failed cmpxchg doesn't provide any memory-ordering guarantees, + * so we need a barrier to order the read of the node data in + * pv_unhash *after* we've read the lock being _Q_SLOW_VAL. + * + * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL. + */ + smp_rmb(); + + /* * Since the above failed to release, this must be the SLOW path. * Therefore start by looking up the blocked node and unhashing it. */ @@ -319,8 +360,11 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) /* * At this point the memory pointed at by lock can be freed/reused, * however we can still use the pv_node to kick the CPU. + * The other vCPU may not really be halted, but kicking an active + * vCPU is harmless other than the additional latency in completing + * the unlock. */ - if (READ_ONCE(node->state) == vcpu_halted) + if (READ_ONCE(node->state) == vcpu_hashed) pv_kick(node->cpu); } /* diff --git a/kernel/locking/rtmutex-tester.c b/kernel/locking/rtmutex-tester.c deleted file mode 100644 index 1d96dd0d93c1..000000000000 --- a/kernel/locking/rtmutex-tester.c +++ /dev/null @@ -1,420 +0,0 @@ -/* - * RT-Mutex-tester: scriptable tester for rt mutexes - * - * started by Thomas Gleixner: - * - * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> - * - */ -#include <linux/device.h> -#include <linux/kthread.h> -#include <linux/export.h> -#include <linux/sched.h> -#include <linux/sched/rt.h> -#include <linux/spinlock.h> -#include <linux/timer.h> -#include <linux/freezer.h> -#include <linux/stat.h> - -#include "rtmutex.h" - -#define MAX_RT_TEST_THREADS 8 -#define MAX_RT_TEST_MUTEXES 8 - -static spinlock_t rttest_lock; -static atomic_t rttest_event; - -struct test_thread_data { - int opcode; - int opdata; - int mutexes[MAX_RT_TEST_MUTEXES]; - int event; - struct device dev; -}; - -static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; -static struct task_struct *threads[MAX_RT_TEST_THREADS]; -static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES]; - -enum test_opcodes { - RTTEST_NOP = 0, - RTTEST_SCHEDOT, /* 1 Sched other, data = nice */ - RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */ - RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */ - RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */ - RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */ - RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ - RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ - RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ - /* 9, 10 - reserved for BKL commemoration */ - RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */ - RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ - RTTEST_RESET = 99, /* 99 Reset all pending operations */ -}; - -static int handle_op(struct test_thread_data *td, int lockwakeup) -{ - int i, id, ret = -EINVAL; - - switch(td->opcode) { - - case RTTEST_NOP: - return 0; - - case RTTEST_LOCKCONT: - td->mutexes[td->opdata] = 1; - td->event = atomic_add_return(1, &rttest_event); - return 0; - - case RTTEST_RESET: - for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) { - if (td->mutexes[i] == 4) { - rt_mutex_unlock(&mutexes[i]); - td->mutexes[i] = 0; - } - } - return 0; - - case RTTEST_RESETEVENT: - atomic_set(&rttest_event, 0); - return 0; - - default: - if (lockwakeup) - return ret; - } - - switch(td->opcode) { - - case RTTEST_LOCK: - case RTTEST_LOCKNOWAIT: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES) - return ret; - - td->mutexes[id] = 1; - td->event = atomic_add_return(1, &rttest_event); - rt_mutex_lock(&mutexes[id]); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = 4; - return 0; - - case RTTEST_LOCKINT: - case RTTEST_LOCKINTNOWAIT: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES) - return ret; - - td->mutexes[id] = 1; - td->event = atomic_add_return(1, &rttest_event); - ret = rt_mutex_lock_interruptible(&mutexes[id], 0); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = ret ? 0 : 4; - return ret ? -EINTR : 0; - - case RTTEST_UNLOCK: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4) - return ret; - - td->event = atomic_add_return(1, &rttest_event); - rt_mutex_unlock(&mutexes[id]); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = 0; - return 0; - - default: - break; - } - return ret; -} - -/* - * Schedule replacement for rtsem_down(). Only called for threads with - * PF_MUTEX_TESTER set. - * - * This allows us to have finegrained control over the event flow. - * - */ -void schedule_rt_mutex_test(struct rt_mutex *mutex) -{ - int tid, op, dat; - struct test_thread_data *td; - - /* We have to lookup the task */ - for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) { - if (threads[tid] == current) - break; - } - - BUG_ON(tid == MAX_RT_TEST_THREADS); - - td = &thread_data[tid]; - - op = td->opcode; - dat = td->opdata; - - switch (op) { - case RTTEST_LOCK: - case RTTEST_LOCKINT: - case RTTEST_LOCKNOWAIT: - case RTTEST_LOCKINTNOWAIT: - if (mutex != &mutexes[dat]) - break; - - if (td->mutexes[dat] != 1) - break; - - td->mutexes[dat] = 2; - td->event = atomic_add_return(1, &rttest_event); - break; - - default: - break; - } - - schedule(); - - - switch (op) { - case RTTEST_LOCK: - case RTTEST_LOCKINT: - if (mutex != &mutexes[dat]) - return; - - if (td->mutexes[dat] != 2) - return; - - td->mutexes[dat] = 3; - td->event = atomic_add_return(1, &rttest_event); - break; - - case RTTEST_LOCKNOWAIT: - case RTTEST_LOCKINTNOWAIT: - if (mutex != &mutexes[dat]) - return; - - if (td->mutexes[dat] != 2) - return; - - td->mutexes[dat] = 1; - td->event = atomic_add_return(1, &rttest_event); - return; - - default: - return; - } - - td->opcode = 0; - - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - - if (td->opcode > 0) { - int ret; - - set_current_state(TASK_RUNNING); - ret = handle_op(td, 1); - set_current_state(TASK_INTERRUPTIBLE); - if (td->opcode == RTTEST_LOCKCONT) - break; - td->opcode = ret; - } - - /* Wait for the next command to be executed */ - schedule(); - } - - /* Restore previous command and data */ - td->opcode = op; - td->opdata = dat; -} - -static int test_func(void *data) -{ - struct test_thread_data *td = data; - int ret; - - current->flags |= PF_MUTEX_TESTER; - set_freezable(); - allow_signal(SIGHUP); - - for(;;) { - - set_current_state(TASK_INTERRUPTIBLE); - - if (td->opcode > 0) { - set_current_state(TASK_RUNNING); - ret = handle_op(td, 0); - set_current_state(TASK_INTERRUPTIBLE); - td->opcode = ret; - } - - /* Wait for the next command to be executed */ - schedule(); - try_to_freeze(); - - if (signal_pending(current)) - flush_signals(current); - - if(kthread_should_stop()) - break; - } - return 0; -} - -/** - * sysfs_test_command - interface for test commands - * @dev: thread reference - * @buf: command for actual step - * @count: length of buffer - * - * command syntax: - * - * opcode:data - */ -static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct sched_param schedpar; - struct test_thread_data *td; - char cmdbuf[32]; - int op, dat, tid, ret; - - td = container_of(dev, struct test_thread_data, dev); - tid = td->dev.id; - - /* strings from sysfs write are not 0 terminated! */ - if (count >= sizeof(cmdbuf)) - return -EINVAL; - - /* strip of \n: */ - if (buf[count-1] == '\n') - count--; - if (count < 1) - return -EINVAL; - - memcpy(cmdbuf, buf, count); - cmdbuf[count] = 0; - - if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2) - return -EINVAL; - - switch (op) { - case RTTEST_SCHEDOT: - schedpar.sched_priority = 0; - ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar); - if (ret) - return ret; - set_user_nice(current, 0); - break; - - case RTTEST_SCHEDRT: - schedpar.sched_priority = dat; - ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar); - if (ret) - return ret; - break; - - case RTTEST_SIGNAL: - send_sig(SIGHUP, threads[tid], 0); - break; - - default: - if (td->opcode > 0) - return -EBUSY; - td->opdata = dat; - td->opcode = op; - wake_up_process(threads[tid]); - } - - return count; -} - -/** - * sysfs_test_status - sysfs interface for rt tester - * @dev: thread to query - * @buf: char buffer to be filled with thread status info - */ -static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct test_thread_data *td; - struct task_struct *tsk; - char *curr = buf; - int i; - - td = container_of(dev, struct test_thread_data, dev); - tsk = threads[td->dev.id]; - - spin_lock(&rttest_lock); - - curr += sprintf(curr, - "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:", - td->opcode, td->event, tsk->state, - (MAX_RT_PRIO - 1) - tsk->prio, - (MAX_RT_PRIO - 1) - tsk->normal_prio, - tsk->pi_blocked_on); - - for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) - curr += sprintf(curr, "%d", td->mutexes[i]); - - spin_unlock(&rttest_lock); - - curr += sprintf(curr, ", T: %p, R: %p\n", tsk, - mutexes[td->dev.id].owner); - - return curr - buf; -} - -static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL); -static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command); - -static struct bus_type rttest_subsys = { - .name = "rttest", - .dev_name = "rttest", -}; - -static int init_test_thread(int id) -{ - thread_data[id].dev.bus = &rttest_subsys; - thread_data[id].dev.id = id; - - threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); - if (IS_ERR(threads[id])) - return PTR_ERR(threads[id]); - - return device_register(&thread_data[id].dev); -} - -static int init_rttest(void) -{ - int ret, i; - - spin_lock_init(&rttest_lock); - - for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) - rt_mutex_init(&mutexes[i]); - - ret = subsys_system_register(&rttest_subsys, NULL); - if (ret) - return ret; - - for (i = 0; i < MAX_RT_TEST_THREADS; i++) { - ret = init_test_thread(i); - if (ret) - break; - ret = device_create_file(&thread_data[i].dev, &dev_attr_status); - if (ret) - break; - ret = device_create_file(&thread_data[i].dev, &dev_attr_command); - if (ret) - break; - } - - printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" ); - - return ret; -} - -device_initcall(init_rttest); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 5674b073473c..7781d801212f 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1120,7 +1120,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, debug_rt_mutex_print_deadlock(waiter); - schedule_rt_mutex(lock); + schedule(); raw_spin_lock(&lock->wait_lock); set_current_state(state); diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 7844f8f0e639..4f5f83c7d2d3 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -15,28 +15,6 @@ #include <linux/rtmutex.h> /* - * The rtmutex in kernel tester is independent of rtmutex debugging. We - * call schedule_rt_mutex_test() instead of schedule() for the tasks which - * belong to the tester. That way we can delay the wakeup path of those - * threads to provoke lock stealing and testing of complex boosting scenarios. - */ -#ifdef CONFIG_RT_MUTEX_TESTER - -extern void schedule_rt_mutex_test(struct rt_mutex *lock); - -#define schedule_rt_mutex(_lock) \ - do { \ - if (!(current->flags & PF_MUTEX_TESTER)) \ - schedule(); \ - else \ - schedule_rt_mutex_test(_lock); \ - } while (0) - -#else -# define schedule_rt_mutex(_lock) schedule() -#endif - -/* * This is the control structure for tasks blocked on a rt_mutex, * which is allocated on the kernel stack on of the blocked task. * diff --git a/kernel/memremap.c b/kernel/memremap.c new file mode 100644 index 000000000000..72b0c66628b6 --- /dev/null +++ b/kernel/memremap.c @@ -0,0 +1,190 @@ +/* + * Copyright(c) 2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/device.h> +#include <linux/types.h> +#include <linux/io.h> +#include <linux/mm.h> +#include <linux/memory_hotplug.h> + +#ifndef ioremap_cache +/* temporary while we convert existing ioremap_cache users to memremap */ +__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) +{ + return ioremap(offset, size); +} +#endif + +/** + * memremap() - remap an iomem_resource as cacheable memory + * @offset: iomem resource start address + * @size: size of remap + * @flags: either MEMREMAP_WB or MEMREMAP_WT + * + * memremap() is "ioremap" for cases where it is known that the resource + * being mapped does not have i/o side effects and the __iomem + * annotation is not applicable. + * + * MEMREMAP_WB - matches the default mapping for "System RAM" on + * the architecture. This is usually a read-allocate write-back cache. + * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM + * memremap() will bypass establishing a new mapping and instead return + * a pointer into the direct map. + * + * MEMREMAP_WT - establish a mapping whereby writes either bypass the + * cache or are written through to memory and never exist in a + * cache-dirty state with respect to program visibility. Attempts to + * map "System RAM" with this mapping type will fail. + */ +void *memremap(resource_size_t offset, size_t size, unsigned long flags) +{ + int is_ram = region_intersects(offset, size, "System RAM"); + void *addr = NULL; + + if (is_ram == REGION_MIXED) { + WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n", + &offset, (unsigned long) size); + return NULL; + } + + /* Try all mapping types requested until one returns non-NULL */ + if (flags & MEMREMAP_WB) { + flags &= ~MEMREMAP_WB; + /* + * MEMREMAP_WB is special in that it can be satisifed + * from the direct map. Some archs depend on the + * capability of memremap() to autodetect cases where + * the requested range is potentially in "System RAM" + */ + if (is_ram == REGION_INTERSECTS) + addr = __va(offset); + else + addr = ioremap_cache(offset, size); + } + + /* + * If we don't have a mapping yet and more request flags are + * pending then we will be attempting to establish a new virtual + * address mapping. Enforce that this mapping is not aliasing + * "System RAM" + */ + if (!addr && is_ram == REGION_INTERSECTS && flags) { + WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n", + &offset, (unsigned long) size); + return NULL; + } + + if (!addr && (flags & MEMREMAP_WT)) { + flags &= ~MEMREMAP_WT; + addr = ioremap_wt(offset, size); + } + + return addr; +} +EXPORT_SYMBOL(memremap); + +void memunmap(void *addr) +{ + if (is_vmalloc_addr(addr)) + iounmap((void __iomem *) addr); +} +EXPORT_SYMBOL(memunmap); + +static void devm_memremap_release(struct device *dev, void *res) +{ + memunmap(res); +} + +static int devm_memremap_match(struct device *dev, void *res, void *match_data) +{ + return *(void **)res == match_data; +} + +void *devm_memremap(struct device *dev, resource_size_t offset, + size_t size, unsigned long flags) +{ + void **ptr, *addr; + + ptr = devres_alloc(devm_memremap_release, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return NULL; + + addr = memremap(offset, size, flags); + if (addr) { + *ptr = addr; + devres_add(dev, ptr); + } else + devres_free(ptr); + + return addr; +} +EXPORT_SYMBOL(devm_memremap); + +void devm_memunmap(struct device *dev, void *addr) +{ + WARN_ON(devres_destroy(dev, devm_memremap_release, devm_memremap_match, + addr)); + memunmap(addr); +} +EXPORT_SYMBOL(devm_memunmap); + +#ifdef CONFIG_ZONE_DEVICE +struct page_map { + struct resource res; +}; + +static void devm_memremap_pages_release(struct device *dev, void *res) +{ + struct page_map *page_map = res; + + /* pages are dead and unused, undo the arch mapping */ + arch_remove_memory(page_map->res.start, resource_size(&page_map->res)); +} + +void *devm_memremap_pages(struct device *dev, struct resource *res) +{ + int is_ram = region_intersects(res->start, resource_size(res), + "System RAM"); + struct page_map *page_map; + int error, nid; + + if (is_ram == REGION_MIXED) { + WARN_ONCE(1, "%s attempted on mixed region %pr\n", + __func__, res); + return ERR_PTR(-ENXIO); + } + + if (is_ram == REGION_INTERSECTS) + return __va(res->start); + + page_map = devres_alloc(devm_memremap_pages_release, + sizeof(*page_map), GFP_KERNEL); + if (!page_map) + return ERR_PTR(-ENOMEM); + + memcpy(&page_map->res, res, sizeof(*res)); + + nid = dev_to_node(dev); + if (nid < 0) + nid = 0; + + error = arch_add_memory(nid, res->start, resource_size(res), true); + if (error) { + devres_free(page_map); + return ERR_PTR(error); + } + + devres_add(dev, page_map); + return __va(res->start); +} +EXPORT_SYMBOL(devm_memremap_pages); +#endif /* CONFIG_ZONE_DEVICE */ diff --git a/kernel/module_signing.c b/kernel/module_signing.c index be5b8fac4bd0..bd62f5cda746 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -10,11 +10,8 @@ */ #include <linux/kernel.h> -#include <linux/err.h> -#include <crypto/public_key.h> -#include <crypto/hash.h> -#include <keys/asymmetric-type.h> #include <keys/system_keyring.h> +#include <crypto/public_key.h> #include "module-internal.h" /* @@ -28,170 +25,22 @@ * - Information block */ struct module_signature { - u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */ - u8 hash; /* Digest algorithm [enum hash_algo] */ - u8 id_type; /* Key identifier type [enum pkey_id_type] */ - u8 signer_len; /* Length of signer's name */ - u8 key_id_len; /* Length of key identifier */ + u8 algo; /* Public-key crypto algorithm [0] */ + u8 hash; /* Digest algorithm [0] */ + u8 id_type; /* Key identifier type [PKEY_ID_PKCS7] */ + u8 signer_len; /* Length of signer's name [0] */ + u8 key_id_len; /* Length of key identifier [0] */ u8 __pad[3]; __be32 sig_len; /* Length of signature data */ }; /* - * Digest the module contents. - */ -static struct public_key_signature *mod_make_digest(enum hash_algo hash, - const void *mod, - unsigned long modlen) -{ - struct public_key_signature *pks; - struct crypto_shash *tfm; - struct shash_desc *desc; - size_t digest_size, desc_size; - int ret; - - pr_devel("==>%s()\n", __func__); - - /* Allocate the hashing algorithm we're going to need and find out how - * big the hash operational data will be. - */ - tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0); - if (IS_ERR(tfm)) - return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm); - - desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); - digest_size = crypto_shash_digestsize(tfm); - - /* We allocate the hash operational data storage on the end of our - * context data and the digest output buffer on the end of that. - */ - ret = -ENOMEM; - pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL); - if (!pks) - goto error_no_pks; - - pks->pkey_hash_algo = hash; - pks->digest = (u8 *)pks + sizeof(*pks) + desc_size; - pks->digest_size = digest_size; - - desc = (void *)pks + sizeof(*pks); - desc->tfm = tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - - ret = crypto_shash_init(desc); - if (ret < 0) - goto error; - - ret = crypto_shash_finup(desc, mod, modlen, pks->digest); - if (ret < 0) - goto error; - - crypto_free_shash(tfm); - pr_devel("<==%s() = ok\n", __func__); - return pks; - -error: - kfree(pks); -error_no_pks: - crypto_free_shash(tfm); - pr_devel("<==%s() = %d\n", __func__, ret); - return ERR_PTR(ret); -} - -/* - * Extract an MPI array from the signature data. This represents the actual - * signature. Each raw MPI is prefaced by a BE 2-byte value indicating the - * size of the MPI in bytes. - * - * RSA signatures only have one MPI, so currently we only read one. - */ -static int mod_extract_mpi_array(struct public_key_signature *pks, - const void *data, size_t len) -{ - size_t nbytes; - MPI mpi; - - if (len < 3) - return -EBADMSG; - nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1]; - data += 2; - len -= 2; - if (len != nbytes) - return -EBADMSG; - - mpi = mpi_read_raw_data(data, nbytes); - if (!mpi) - return -ENOMEM; - pks->mpi[0] = mpi; - pks->nr_mpi = 1; - return 0; -} - -/* - * Request an asymmetric key. - */ -static struct key *request_asymmetric_key(const char *signer, size_t signer_len, - const u8 *key_id, size_t key_id_len) -{ - key_ref_t key; - size_t i; - char *id, *q; - - pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len); - - /* Construct an identifier. */ - id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL); - if (!id) - return ERR_PTR(-ENOKEY); - - memcpy(id, signer, signer_len); - - q = id + signer_len; - *q++ = ':'; - *q++ = ' '; - for (i = 0; i < key_id_len; i++) { - *q++ = hex_asc[*key_id >> 4]; - *q++ = hex_asc[*key_id++ & 0x0f]; - } - - *q = 0; - - pr_debug("Look up: \"%s\"\n", id); - - key = keyring_search(make_key_ref(system_trusted_keyring, 1), - &key_type_asymmetric, id); - if (IS_ERR(key)) - pr_warn("Request for unknown module key '%s' err %ld\n", - id, PTR_ERR(key)); - kfree(id); - - if (IS_ERR(key)) { - switch (PTR_ERR(key)) { - /* Hide some search errors */ - case -EACCES: - case -ENOTDIR: - case -EAGAIN: - return ERR_PTR(-ENOKEY); - default: - return ERR_CAST(key); - } - } - - pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key))); - return key_ref_to_ptr(key); -} - -/* * Verify the signature on a module. */ int mod_verify_sig(const void *mod, unsigned long *_modlen) { - struct public_key_signature *pks; struct module_signature ms; - struct key *key; - const void *sig; size_t modlen = *_modlen, sig_len; - int ret; pr_devel("==>%s(,%zu)\n", __func__, modlen); @@ -205,46 +54,24 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen) if (sig_len >= modlen) return -EBADMSG; modlen -= sig_len; - if ((size_t)ms.signer_len + ms.key_id_len >= modlen) - return -EBADMSG; - modlen -= (size_t)ms.signer_len + ms.key_id_len; - *_modlen = modlen; - sig = mod + modlen; - - /* For the moment, only support RSA and X.509 identifiers */ - if (ms.algo != PKEY_ALGO_RSA || - ms.id_type != PKEY_ID_X509) - return -ENOPKG; - if (ms.hash >= PKEY_HASH__LAST || - !hash_algo_name[ms.hash]) + if (ms.id_type != PKEY_ID_PKCS7) { + pr_err("Module is not signed with expected PKCS#7 message\n"); return -ENOPKG; - - key = request_asymmetric_key(sig, ms.signer_len, - sig + ms.signer_len, ms.key_id_len); - if (IS_ERR(key)) - return PTR_ERR(key); - - pks = mod_make_digest(ms.hash, mod, modlen); - if (IS_ERR(pks)) { - ret = PTR_ERR(pks); - goto error_put_key; } - ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len, - sig_len); - if (ret < 0) - goto error_free_pks; - - ret = verify_signature(key, pks); - pr_devel("verify_signature() = %d\n", ret); + if (ms.algo != 0 || + ms.hash != 0 || + ms.signer_len != 0 || + ms.key_id_len != 0 || + ms.__pad[0] != 0 || + ms.__pad[1] != 0 || + ms.__pad[2] != 0) { + pr_err("PKCS#7 signature info has unexpected non-zero params\n"); + return -EBADMSG; + } -error_free_pks: - mpi_free(pks->rsa.s); - kfree(pks); -error_put_key: - key_put(key); - pr_devel("<==%s() = %d\n", __func__, ret); - return ret; + return system_verify_data(mod, modlen, mod + modlen, sig_len, + VERIFYING_MODULE_SIGNATURE); } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 2f30ca91e4fa..b2066fb5b10f 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -227,27 +227,23 @@ static void hib_init_batch(struct hib_bio_batch *hb) hb->error = 0; } -static void hib_end_io(struct bio *bio, int error) +static void hib_end_io(struct bio *bio) { struct hib_bio_batch *hb = bio->bi_private; - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct page *page = bio->bi_io_vec[0].bv_page; - if (!uptodate || error) { + if (bio->bi_error) { printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", imajor(bio->bi_bdev->bd_inode), iminor(bio->bi_bdev->bd_inode), (unsigned long long)bio->bi_iter.bi_sector); - - if (!error) - error = -EIO; } if (bio_data_dir(bio) == WRITE) put_page(page); - if (error && !hb->error) - hb->error = error; + if (bio->bi_error && !hb->error) + hb->error = bio->bi_error; if (atomic_dec_and_test(&hb->count)) wake_up(&hb->wait); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index cf8c24203368..8f0324ef72ab 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -835,7 +835,7 @@ const struct file_operations kmsg_fops = { .release = devkmsg_release, }; -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE /* * This appends the listed symbols to /proc/vmcore * diff --git a/kernel/profile.c b/kernel/profile.c index a7bcd28d6e9f..99513e1160e5 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -339,7 +339,7 @@ static int profile_cpu_callback(struct notifier_block *info, node = cpu_to_mem(cpu); per_cpu(cpu_profile_flip, cpu) = 0; if (!per_cpu(cpu_profile_hits, cpu)[1]) { - page = alloc_pages_exact_node(node, + page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) @@ -347,7 +347,7 @@ static int profile_cpu_callback(struct notifier_block *info, per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); } if (!per_cpu(cpu_profile_hits, cpu)[0]) { - page = alloc_pages_exact_node(node, + page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) @@ -543,14 +543,14 @@ static int create_hash_tables(void) int node = cpu_to_mem(cpu); struct page *page; - page = alloc_pages_exact_node(node, + page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, 0); if (!page) goto out_cleanup; per_cpu(cpu_profile_hits, cpu)[1] = (struct profile_hit *)page_address(page); - page = alloc_pages_exact_node(node, + page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, 0); if (!page) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index c8e0e050a36a..787320de68e0 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -556,6 +556,19 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data) if (data & ~(unsigned long)PTRACE_O_MASK) return -EINVAL; + if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) { + if (!config_enabled(CONFIG_CHECKPOINT_RESTORE) || + !config_enabled(CONFIG_SECCOMP)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (seccomp_mode(¤t->seccomp) != SECCOMP_MODE_DISABLED || + current->ptrace & PT_SUSPEND_SECCOMP) + return -EPERM; + } + /* Avoid intermediate state when all opts are cleared */ flags = child->ptrace; flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT); diff --git a/kernel/reboot.c b/kernel/reboot.c index d20c85d9f8c0..bd30a973fe94 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -346,7 +346,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, kernel_restart(buffer); break; -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE case LINUX_REBOOT_CMD_KEXEC: ret = kernel_kexec(); break; diff --git a/kernel/resource.c b/kernel/resource.c index fed052a1bc9f..f150dbbe6f62 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -492,40 +492,51 @@ int __weak page_is_ram(unsigned long pfn) } EXPORT_SYMBOL_GPL(page_is_ram); -/* - * Search for a resouce entry that fully contains the specified region. - * If found, return 1 if it is RAM, 0 if not. - * If not found, or region is not fully contained, return -1 +/** + * region_intersects() - determine intersection of region with known resources + * @start: region start address + * @size: size of region + * @name: name of resource (in iomem_resource) * - * Used by the ioremap functions to ensure the user is not remapping RAM and is - * a vast speed up over walking through the resource table page by page. + * Check if the specified region partially overlaps or fully eclipses a + * resource identified by @name. Return REGION_DISJOINT if the region + * does not overlap @name, return REGION_MIXED if the region overlaps + * @type and another resource, and return REGION_INTERSECTS if the + * region overlaps @type and no other defined resource. Note, that + * REGION_INTERSECTS is also returned in the case when the specified + * region overlaps RAM and undefined memory holes. + * + * region_intersect() is used by memory remapping functions to ensure + * the user is not remapping RAM and is a vast speed up over walking + * through the resource table page by page. */ -int region_is_ram(resource_size_t start, unsigned long size) +int region_intersects(resource_size_t start, size_t size, const char *name) { - struct resource *p; - resource_size_t end = start + size - 1; unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY; - const char *name = "System RAM"; - int ret = -1; + resource_size_t end = start + size - 1; + int type = 0; int other = 0; + struct resource *p; read_lock(&resource_lock); for (p = iomem_resource.child; p ; p = p->sibling) { - if (p->end < start) - continue; - - if (p->start <= start && end <= p->end) { - /* resource fully contains region */ - if ((p->flags != flags) || strcmp(p->name, name)) - ret = 0; - else - ret = 1; - break; - } - if (end < p->start) - break; /* not found */ + bool is_type = strcmp(p->name, name) == 0 && p->flags == flags; + + if (start >= p->start && start <= p->end) + is_type ? type++ : other++; + if (end >= p->start && end <= p->end) + is_type ? type++ : other++; + if (p->start >= start && p->end <= end) + is_type ? type++ : other++; } read_unlock(&resource_lock); - return ret; + + if (other == 0) + return type ? REGION_INTERSECTS : REGION_DISJOINT; + + if (type) + return REGION_MIXED; + + return REGION_DISJOINT; } void __weak arch_remove_reservations(struct resource *avail) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8b864ecee0e1..3595403921bd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -164,14 +164,12 @@ struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { static void sched_feat_disable(int i) { - if (static_key_enabled(&sched_feat_keys[i])) - static_key_slow_dec(&sched_feat_keys[i]); + static_key_disable(&sched_feat_keys[i]); } static void sched_feat_enable(int i) { - if (!static_key_enabled(&sched_feat_keys[i])) - static_key_slow_inc(&sched_feat_keys[i]); + static_key_enable(&sched_feat_keys[i]); } #else static void sched_feat_disable(int i) { }; @@ -8133,7 +8131,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) sched_offline_group(tg); } -static void cpu_cgroup_fork(struct task_struct *task) +static void cpu_cgroup_fork(struct task_struct *task, void *private) { sched_move_task(task); } diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 052e02672d12..272d9322bc5d 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -106,9 +106,10 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) } EXPORT_SYMBOL_GPL(__wake_up_locked); -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr, + void *key) { - __wake_up_common(q, mode, 1, 0, key); + __wake_up_common(q, mode, nr, 0, key); } EXPORT_SYMBOL_GPL(__wake_up_locked_key); @@ -283,7 +284,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, if (!list_empty(&wait->task_list)) list_del_init(&wait->task_list); else if (waitqueue_active(q)) - __wake_up_locked_key(q, mode, key); + __wake_up_locked_key(q, mode, 1, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(abort_exclusive_wait); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 245df6b32b81..5bd4779282df 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -175,17 +175,16 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) */ static u32 seccomp_run_filters(struct seccomp_data *sd) { - struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter); struct seccomp_data sd_local; u32 ret = SECCOMP_RET_ALLOW; + /* Make sure cross-thread synced filter points somewhere sane. */ + struct seccomp_filter *f = + lockless_dereference(current->seccomp.filter); /* Ensure unexpected behavior doesn't result in failing open. */ if (unlikely(WARN_ON(f == NULL))) return SECCOMP_RET_KILL; - /* Make sure cross-thread synced filter points somewhere sane. */ - smp_read_barrier_depends(); - if (!sd) { populate_seccomp_data(&sd_local); sd = &sd_local; @@ -549,7 +548,11 @@ void secure_computing_strict(int this_syscall) { int mode = current->seccomp.mode; - if (mode == 0) + if (config_enabled(CONFIG_CHECKPOINT_RESTORE) && + unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) + return; + + if (mode == SECCOMP_MODE_DISABLED) return; else if (mode == SECCOMP_MODE_STRICT) __secure_computing_strict(this_syscall); @@ -650,6 +653,10 @@ u32 seccomp_phase1(struct seccomp_data *sd) int this_syscall = sd ? sd->nr : syscall_get_nr(current, task_pt_regs(current)); + if (config_enabled(CONFIG_CHECKPOINT_RESTORE) && + unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) + return SECCOMP_PHASE1_OK; + switch (mode) { case SECCOMP_MODE_STRICT: __secure_computing_strict(this_syscall); /* may call do_exit */ diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 7c434c39f02a..a818cbc73e14 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -113,7 +113,8 @@ static int smpboot_thread_fn(void *data) if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); preempt_enable(); - if (ht->cleanup) + /* cleanup must mirror setup */ + if (ht->cleanup && td->status != HP_THREAD_NONE) ht->cleanup(td->cpu, cpu_online(td->cpu)); kfree(td); return 0; @@ -259,15 +260,6 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) { unsigned int cpu; - /* Unpark any threads that were voluntarily parked. */ - for_each_cpu_not(cpu, ht->cpumask) { - if (cpu_online(cpu)) { - struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); - if (tsk) - kthread_unpark(tsk); - } - } - /* We need to destroy also the parked threads of offline cpus */ for_each_possible_cpu(cpu) { struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); @@ -281,19 +273,22 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) } /** - * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug + * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related + * to hotplug * @plug_thread: Hotplug thread descriptor + * @cpumask: The cpumask where threads run * * Creates and starts the threads on all online cpus. */ -int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) +int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread, + const struct cpumask *cpumask) { unsigned int cpu; int ret = 0; if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL)) return -ENOMEM; - cpumask_copy(plug_thread->cpumask, cpu_possible_mask); + cpumask_copy(plug_thread->cpumask, cpumask); get_online_cpus(); mutex_lock(&smpboot_threads_lock); @@ -301,9 +296,11 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) ret = __smpboot_create_thread(plug_thread, cpu); if (ret) { smpboot_destroy_threads(plug_thread); + free_cpumask_var(plug_thread->cpumask); goto out; } - smpboot_unpark_thread(plug_thread, cpu); + if (cpumask_test_cpu(cpu, cpumask)) + smpboot_unpark_thread(plug_thread, cpu); } list_add(&plug_thread->list, &hotplug_threads); out: @@ -311,7 +308,7 @@ out: put_online_cpus(); return ret; } -EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); +EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask); /** * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index ca7d84f438f1..03c3875d9958 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -219,6 +219,7 @@ cond_syscall(compat_sys_timerfd_gettime); cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); cond_syscall(sys_memfd_create); +cond_syscall(sys_userfaultfd); /* performance counters: */ cond_syscall(sys_perf_event_open); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 19b62b522158..e69201d8094e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -621,7 +621,7 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE { .procname = "kexec_load_disabled", .data = &kexec_load_disabled, @@ -1995,7 +1995,7 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, int val = *valp; if (val < 0) { *negp = true; - *lvalp = (unsigned long)-val; + *lvalp = -(unsigned long)val; } else { *negp = false; *lvalp = (unsigned long)val; @@ -2201,7 +2201,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, int val = *valp; if (val < 0) { *negp = true; - *lvalp = (unsigned long)-val; + *lvalp = -(unsigned long)val; } else { *negp = false; *lvalp = (unsigned long)val; @@ -2436,7 +2436,7 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, unsigned long lval; if (val < 0) { *negp = true; - lval = (unsigned long)-val; + lval = -(unsigned long)val; } else { *negp = false; lval = (unsigned long)val; @@ -2459,7 +2459,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp unsigned long lval; if (val < 0) { *negp = true; - lval = (unsigned long)-val; + lval = -(unsigned long)val; } else { *negp = false; lval = (unsigned long)val; @@ -2484,7 +2484,7 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, unsigned long lval; if (val < 0) { *negp = true; - lval = (unsigned long)-val; + lval = -(unsigned long)val; } else { *negp = false; lval = (unsigned long)val; diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S deleted file mode 100644 index 3e9868d47535..000000000000 --- a/kernel/system_certificates.S +++ /dev/null @@ -1,20 +0,0 @@ -#include <linux/export.h> -#include <linux/init.h> - - __INITRODATA - - .align 8 - .globl VMLINUX_SYMBOL(system_certificate_list) -VMLINUX_SYMBOL(system_certificate_list): -__cert_list_start: - .incbin "kernel/x509_certificate_list" -__cert_list_end: - - .align 8 - .globl VMLINUX_SYMBOL(system_certificate_list_size) -VMLINUX_SYMBOL(system_certificate_list_size): -#ifdef CONFIG_64BIT - .quad __cert_list_end - __cert_list_start -#else - .long __cert_list_end - __cert_list_start -#endif diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c deleted file mode 100644 index 875f64e8935b..000000000000 --- a/kernel/system_keyring.c +++ /dev/null @@ -1,106 +0,0 @@ -/* System trusted keyring for trusted public keys - * - * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#include <linux/export.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/cred.h> -#include <linux/err.h> -#include <keys/asymmetric-type.h> -#include <keys/system_keyring.h> -#include "module-internal.h" - -struct key *system_trusted_keyring; -EXPORT_SYMBOL_GPL(system_trusted_keyring); - -extern __initconst const u8 system_certificate_list[]; -extern __initconst const unsigned long system_certificate_list_size; - -/* - * Load the compiled-in keys - */ -static __init int system_trusted_keyring_init(void) -{ - pr_notice("Initialise system trusted keyring\n"); - - system_trusted_keyring = - keyring_alloc(".system_keyring", - KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), - ((KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH), - KEY_ALLOC_NOT_IN_QUOTA, NULL); - if (IS_ERR(system_trusted_keyring)) - panic("Can't allocate system trusted keyring\n"); - - set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags); - return 0; -} - -/* - * Must be initialised before we try and load the keys into the keyring. - */ -device_initcall(system_trusted_keyring_init); - -/* - * Load the compiled-in list of X.509 certificates. - */ -static __init int load_system_certificate_list(void) -{ - key_ref_t key; - const u8 *p, *end; - size_t plen; - - pr_notice("Loading compiled-in X.509 certificates\n"); - - p = system_certificate_list; - end = p + system_certificate_list_size; - while (p < end) { - /* Each cert begins with an ASN.1 SEQUENCE tag and must be more - * than 256 bytes in size. - */ - if (end - p < 4) - goto dodgy_cert; - if (p[0] != 0x30 && - p[1] != 0x82) - goto dodgy_cert; - plen = (p[2] << 8) | p[3]; - plen += 4; - if (plen > end - p) - goto dodgy_cert; - - key = key_create_or_update(make_key_ref(system_trusted_keyring, 1), - "asymmetric", - NULL, - p, - plen, - ((KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ), - KEY_ALLOC_NOT_IN_QUOTA | - KEY_ALLOC_TRUSTED); - if (IS_ERR(key)) { - pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", - PTR_ERR(key)); - } else { - set_bit(KEY_FLAG_BUILTIN, &key_ref_to_ptr(key)->flags); - pr_notice("Loaded X.509 cert '%s'\n", - key_ref_to_ptr(key)->description); - key_ref_put(key); - } - p += plen; - } - - return 0; - -dodgy_cert: - pr_err("Problem parsing in-kernel X.509 certificate list\n"); - return 0; -} -late_initcall(load_system_certificate_list); diff --git a/kernel/task_work.c b/kernel/task_work.c index 8727032e3a6f..53fa971d000d 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -18,6 +18,8 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ * This is like the signal handler which runs in kernel mode, but it doesn't * try to wake up the @task. * + * Note: there is no ordering guarantee on works queued here. + * * RETURNS: * 0 if succeeds or -ESRCH. */ @@ -108,16 +110,6 @@ void task_work_run(void) raw_spin_unlock_wait(&task->pi_lock); smp_mb(); - /* Reverse the list to run the works in fifo order */ - head = NULL; - do { - next = work->next; - work->next = head; - head = work; - work = next; - } while (work); - - work = head; do { next = work->next; work->func(work); diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index b3e6b39b6cf9..90e72a0c3047 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -778,9 +778,6 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, if (likely(!bt)) return; - if (!error && !bio_flagged(bio, BIO_UPTODATE)) - error = EIO; - __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio->bi_rw, what, error, 0, NULL); } @@ -887,8 +884,7 @@ static void blk_add_trace_split(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT, - !bio_flagged(bio, BIO_UPTODATE), - sizeof(rpdu), &rpdu); + bio->bi_error, sizeof(rpdu), &rpdu); } } @@ -920,8 +916,8 @@ static void blk_add_trace_bio_remap(void *ignore, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio->bi_rw, BLK_TA_REMAP, - !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); + bio->bi_rw, BLK_TA_REMAP, bio->bi_error, + sizeof(r), &r); } /** diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 88a041adee90..0fe96c7c8803 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -81,13 +81,16 @@ static const struct bpf_func_proto bpf_probe_read_proto = { /* * limited trace_printk() - * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed + * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed */ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) { char *fmt = (char *) (long) r1; + bool str_seen = false; int mod[3] = {}; int fmt_cnt = 0; + u64 unsafe_addr; + char buf[64]; int i; /* @@ -114,12 +117,37 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) if (fmt[i] == 'l') { mod[fmt_cnt]++; i++; - } else if (fmt[i] == 'p') { + } else if (fmt[i] == 'p' || fmt[i] == 's') { mod[fmt_cnt]++; i++; if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0) return -EINVAL; fmt_cnt++; + if (fmt[i - 1] == 's') { + if (str_seen) + /* allow only one '%s' per fmt string */ + return -EINVAL; + str_seen = true; + + switch (fmt_cnt) { + case 1: + unsafe_addr = r3; + r3 = (long) buf; + break; + case 2: + unsafe_addr = r4; + r4 = (long) buf; + break; + case 3: + unsafe_addr = r5; + r5 = (long) buf; + break; + } + buf[0] = 0; + strncpy_from_unsafe(buf, + (void *) (long) unsafe_addr, + sizeof(buf)); + } continue; } @@ -158,6 +186,35 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } +static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) +{ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct perf_event *event; + + if (unlikely(index >= array->map.max_entries)) + return -E2BIG; + + event = (struct perf_event *)array->ptrs[index]; + if (!event) + return -ENOENT; + + /* + * we don't know if the function is run successfully by the + * return value. It can be judged in other places, such as + * eBPF programs. + */ + return perf_event_read_local(event); +} + +const struct bpf_func_proto bpf_perf_event_read_proto = { + .func = bpf_perf_event_read, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -183,6 +240,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return bpf_get_trace_printk_proto(); case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_perf_event_read: + return &bpf_perf_event_read_proto; default: return NULL; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eb11011b5292..b0623ac785a2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -630,13 +630,18 @@ static int function_stat_show(struct seq_file *m, void *v) goto out; } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + avg = rec->time; + do_div(avg, rec->counter); + if (tracing_thresh && (avg < tracing_thresh)) + goto out; +#endif + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); seq_printf(m, " %-30.30s %10lu", str, rec->counter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER seq_puts(m, " "); - avg = rec->time; - do_div(avg, rec->counter); /* Sample standard deviation (s^2) */ if (rec->counter <= 1) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6260717c18e3..fc347f8b1bca 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -400,6 +400,17 @@ struct rb_irq_work { }; /* + * Structure to hold event state and handle nested events. + */ +struct rb_event_info { + u64 ts; + u64 delta; + unsigned long length; + struct buffer_page *tail_page; + int add_timestamp; +}; + +/* * Used for which event context the event is in. * NMI = 0 * IRQ = 1 @@ -1876,73 +1887,6 @@ rb_event_index(struct ring_buffer_event *event) return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; } -static inline int -rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - unsigned long addr = (unsigned long)event; - unsigned long index; - - index = rb_event_index(event); - addr &= PAGE_MASK; - - return cpu_buffer->commit_page->page == (void *)addr && - rb_commit_index(cpu_buffer) == index; -} - -static void -rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) -{ - unsigned long max_count; - - /* - * We only race with interrupts and NMIs on this CPU. - * If we own the commit event, then we can commit - * all others that interrupted us, since the interruptions - * are in stack format (they finish before they come - * back to us). This allows us to do a simple loop to - * assign the commit to the tail. - */ - again: - max_count = cpu_buffer->nr_pages * 100; - - while (cpu_buffer->commit_page != cpu_buffer->tail_page) { - if (RB_WARN_ON(cpu_buffer, !(--max_count))) - return; - if (RB_WARN_ON(cpu_buffer, - rb_is_reader_page(cpu_buffer->tail_page))) - return; - local_set(&cpu_buffer->commit_page->page->commit, - rb_page_write(cpu_buffer->commit_page)); - rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; - /* add barrier to keep gcc from optimizing too much */ - barrier(); - } - while (rb_commit_index(cpu_buffer) != - rb_page_write(cpu_buffer->commit_page)) { - - local_set(&cpu_buffer->commit_page->page->commit, - rb_page_write(cpu_buffer->commit_page)); - RB_WARN_ON(cpu_buffer, - local_read(&cpu_buffer->commit_page->page->commit) & - ~RB_WRITE_MASK); - barrier(); - } - - /* again, keep gcc from optimizing */ - barrier(); - - /* - * If an interrupt came in just after the first while loop - * and pushed the tail page forward, we will be left with - * a dangling commit that will never go forward. - */ - if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) - goto again; -} - static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; @@ -1968,64 +1912,6 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) iter->head = 0; } -/* Slow path, do not inline */ -static noinline struct ring_buffer_event * -rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) -{ - event->type_len = RINGBUF_TYPE_TIME_EXTEND; - - /* Not the first event on the page? */ - if (rb_event_index(event)) { - event->time_delta = delta & TS_MASK; - event->array[0] = delta >> TS_SHIFT; - } else { - /* nope, just zero it */ - event->time_delta = 0; - event->array[0] = 0; - } - - return skip_time_extend(event); -} - -/** - * rb_update_event - update event type and data - * @event: the event to update - * @type: the type of event - * @length: the size of the event field in the ring buffer - * - * Update the type and data fields of the event. The length - * is the actual size that is written to the ring buffer, - * and with this, we can determine what to place into the - * data field. - */ -static void -rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event, unsigned length, - int add_timestamp, u64 delta) -{ - /* Only a commit updates the timestamp */ - if (unlikely(!rb_event_is_commit(cpu_buffer, event))) - delta = 0; - - /* - * If we need to add a timestamp, then we - * add it to the start of the resevered space. - */ - if (unlikely(add_timestamp)) { - event = rb_add_time_stamp(event, delta); - length -= RB_LEN_TIME_EXTEND; - delta = 0; - } - - event->time_delta = delta; - length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { - event->type_len = 0; - event->array[0] = length; - } else - event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); -} - /* * rb_handle_head_page - writer hit the head page * @@ -2184,29 +2070,13 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, return 0; } -static unsigned rb_calculate_event_length(unsigned length) -{ - struct ring_buffer_event event; /* Used only for sizeof array */ - - /* zero length can cause confusions */ - if (!length) - length++; - - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) - length += sizeof(event.array[0]); - - length += RB_EVNT_HDR_SIZE; - length = ALIGN(length, RB_ARCH_ALIGNMENT); - - return length; -} - static inline void rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page *tail_page, - unsigned long tail, unsigned long length) + unsigned long tail, struct rb_event_info *info) { + struct buffer_page *tail_page = info->tail_page; struct ring_buffer_event *event; + unsigned long length = info->length; /* * Only the event that crossed the page boundary @@ -2276,13 +2146,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, */ static noinline struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, - unsigned long length, unsigned long tail, - struct buffer_page *tail_page, u64 ts) + unsigned long tail, struct rb_event_info *info) { + struct buffer_page *tail_page = info->tail_page; struct buffer_page *commit_page = cpu_buffer->commit_page; struct ring_buffer *buffer = cpu_buffer->buffer; struct buffer_page *next_page; int ret; + u64 ts; next_page = tail_page; @@ -2368,74 +2239,120 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, out_again: - rb_reset_tail(cpu_buffer, tail_page, tail, length); + rb_reset_tail(cpu_buffer, tail, info); /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); out_reset: /* reset write */ - rb_reset_tail(cpu_buffer, tail_page, tail, length); + rb_reset_tail(cpu_buffer, tail, info); return NULL; } -static struct ring_buffer_event * -__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, - unsigned long length, u64 ts, - u64 delta, int add_timestamp) +/* Slow path, do not inline */ +static noinline struct ring_buffer_event * +rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) { - struct buffer_page *tail_page; - struct ring_buffer_event *event; - unsigned long tail, write; + event->type_len = RINGBUF_TYPE_TIME_EXTEND; - /* - * If the time delta since the last event is too big to - * hold in the time field of the event, then we append a - * TIME EXTEND event ahead of the data event. - */ - if (unlikely(add_timestamp)) - length += RB_LEN_TIME_EXTEND; + /* Not the first event on the page? */ + if (rb_event_index(event)) { + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; + } else { + /* nope, just zero it */ + event->time_delta = 0; + event->array[0] = 0; + } - tail_page = cpu_buffer->tail_page; - write = local_add_return(length, &tail_page->write); + return skip_time_extend(event); +} - /* set write to only the index of the write */ - write &= RB_WRITE_MASK; - tail = write - length; +static inline int rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event); + +/** + * rb_update_event - update event type and data + * @event: the event to update + * @type: the type of event + * @length: the size of the event field in the ring buffer + * + * Update the type and data fields of the event. The length + * is the actual size that is written to the ring buffer, + * and with this, we can determine what to place into the + * data field. + */ +static void +rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, + struct rb_event_info *info) +{ + unsigned length = info->length; + u64 delta = info->delta; + + /* Only a commit updates the timestamp */ + if (unlikely(!rb_event_is_commit(cpu_buffer, event))) + delta = 0; /* - * If this is the first commit on the page, then it has the same - * timestamp as the page itself. + * If we need to add a timestamp, then we + * add it to the start of the resevered space. */ - if (!tail) + if (unlikely(info->add_timestamp)) { + event = rb_add_time_stamp(event, delta); + length -= RB_LEN_TIME_EXTEND; delta = 0; + } - /* See if we shot pass the end of this buffer page */ - if (unlikely(write > BUF_PAGE_SIZE)) - return rb_move_tail(cpu_buffer, length, tail, - tail_page, ts); + event->time_delta = delta; + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { + event->type_len = 0; + event->array[0] = length; + } else + event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); +} - /* We reserved something on the buffer */ +static unsigned rb_calculate_event_length(unsigned length) +{ + struct ring_buffer_event event; /* Used only for sizeof array */ - event = __rb_page_index(tail_page, tail); - kmemcheck_annotate_bitfield(event, bitfield); - rb_update_event(cpu_buffer, event, length, add_timestamp, delta); + /* zero length can cause confusions */ + if (!length) + length++; - local_inc(&tail_page->entries); + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) + length += sizeof(event.array[0]); + + length += RB_EVNT_HDR_SIZE; + length = ALIGN(length, RB_ARCH_ALIGNMENT); /* - * If this is the first commit on the page, then update - * its timestamp. + * In case the time delta is larger than the 27 bits for it + * in the header, we need to add a timestamp. If another + * event comes in when trying to discard this one to increase + * the length, then the timestamp will be added in the allocated + * space of this event. If length is bigger than the size needed + * for the TIME_EXTEND, then padding has to be used. The events + * length must be either RB_LEN_TIME_EXTEND, or greater than or equal + * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding. + * As length is a multiple of 4, we only need to worry if it + * is 12 (RB_LEN_TIME_EXTEND + 4). */ - if (!tail) - tail_page->page->time_stamp = ts; + if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT) + length += RB_ALIGNMENT; - /* account for these added bytes */ - local_add(length, &cpu_buffer->entries_bytes); + return length; +} - return event; +#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline bool sched_clock_stable(void) +{ + return true; } +#endif static inline int rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, @@ -2483,6 +2400,59 @@ static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) local_inc(&cpu_buffer->commits); } +static void +rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long max_count; + + /* + * We only race with interrupts and NMIs on this CPU. + * If we own the commit event, then we can commit + * all others that interrupted us, since the interruptions + * are in stack format (they finish before they come + * back to us). This allows us to do a simple loop to + * assign the commit to the tail. + */ + again: + max_count = cpu_buffer->nr_pages * 100; + + while (cpu_buffer->commit_page != cpu_buffer->tail_page) { + if (RB_WARN_ON(cpu_buffer, !(--max_count))) + return; + if (RB_WARN_ON(cpu_buffer, + rb_is_reader_page(cpu_buffer->tail_page))) + return; + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; + /* add barrier to keep gcc from optimizing too much */ + barrier(); + } + while (rb_commit_index(cpu_buffer) != + rb_page_write(cpu_buffer->commit_page)) { + + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + RB_WARN_ON(cpu_buffer, + local_read(&cpu_buffer->commit_page->page->commit) & + ~RB_WRITE_MASK); + barrier(); + } + + /* again, keep gcc from optimizing */ + barrier(); + + /* + * If an interrupt came in just after the first while loop + * and pushed the tail page forward, we will be left with + * a dangling commit that will never go forward. + */ + if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) + goto again; +} + static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long commits; @@ -2515,91 +2485,94 @@ static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) } } -static struct ring_buffer_event * -rb_reserve_next_event(struct ring_buffer *buffer, - struct ring_buffer_per_cpu *cpu_buffer, - unsigned long length) +static inline void rb_event_discard(struct ring_buffer_event *event) { - struct ring_buffer_event *event; - u64 ts, delta; - int nr_loops = 0; - int add_timestamp; - u64 diff; + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); - rb_start_commit(cpu_buffer); + /* array[0] holds the actual length for the discarded event */ + event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; + event->type_len = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; +} -#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP - /* - * Due to the ability to swap a cpu buffer from a buffer - * it is possible it was swapped before we committed. - * (committing stops a swap). We check for it here and - * if it happened, we have to fail the write. - */ - barrier(); - if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) { - local_dec(&cpu_buffer->committing); - local_dec(&cpu_buffer->commits); - return NULL; - } -#endif +static inline int +rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long addr = (unsigned long)event; + unsigned long index; - length = rb_calculate_event_length(length); - again: - add_timestamp = 0; - delta = 0; + index = rb_event_index(event); + addr &= PAGE_MASK; + + return cpu_buffer->commit_page->page == (void *)addr && + rb_commit_index(cpu_buffer) == index; +} + +static void +rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + u64 delta; /* - * We allow for interrupts to reenter here and do a trace. - * If one does, it will cause this original code to loop - * back here. Even with heavy interrupts happening, this - * should only happen a few times in a row. If this happens - * 1000 times in a row, there must be either an interrupt - * storm or we have something buggy. - * Bail! + * The event first in the commit queue updates the + * time stamp. */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) - goto out_fail; + if (rb_event_is_commit(cpu_buffer, event)) { + /* + * A commit event that is first on a page + * updates the write timestamp with the page stamp + */ + if (!rb_event_index(event)) + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; + else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { + delta = event->array[0]; + delta <<= TS_SHIFT; + delta += event->time_delta; + cpu_buffer->write_stamp += delta; + } else + cpu_buffer->write_stamp += event->time_delta; + } +} - ts = rb_time_stamp(cpu_buffer->buffer); - diff = ts - cpu_buffer->write_stamp; +static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + local_inc(&cpu_buffer->entries); + rb_update_write_stamp(cpu_buffer, event); + rb_end_commit(cpu_buffer); +} - /* make sure this diff is calculated here */ - barrier(); +static __always_inline void +rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) +{ + bool pagebusy; - /* Did the write stamp get updated already? */ - if (likely(ts >= cpu_buffer->write_stamp)) { - delta = diff; - if (unlikely(test_time_stamp(delta))) { - int local_clock_stable = 1; -#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - local_clock_stable = sched_clock_stable(); -#endif - WARN_ONCE(delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", - (unsigned long long)delta, - (unsigned long long)ts, - (unsigned long long)cpu_buffer->write_stamp, - local_clock_stable ? "" : - "If you just came from a suspend/resume,\n" - "please switch to the trace global clock:\n" - " echo global > /sys/kernel/debug/tracing/trace_clock\n"); - add_timestamp = 1; - } + if (buffer->irq_work.waiters_pending) { + buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&buffer->irq_work.work); } - event = __rb_reserve_next(cpu_buffer, length, ts, - delta, add_timestamp); - if (unlikely(PTR_ERR(event) == -EAGAIN)) - goto again; - - if (!event) - goto out_fail; + if (cpu_buffer->irq_work.waiters_pending) { + cpu_buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } - return event; + pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; - out_fail: - rb_end_commit(cpu_buffer); - return NULL; + if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { + cpu_buffer->irq_work.wakeup_full = true; + cpu_buffer->irq_work.full_waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } } /* @@ -2672,6 +2645,178 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) } /** + * ring_buffer_unlock_commit - commit a reserved + * @buffer: The buffer to commit to + * @event: The event pointer to commit. + * + * This commits the data to the ring buffer, and releases any locks held. + * + * Must be paired with ring_buffer_lock_reserve. + */ +int ring_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu = raw_smp_processor_id(); + + cpu_buffer = buffer->buffers[cpu]; + + rb_commit(cpu_buffer, event); + + rb_wakeups(buffer, cpu_buffer); + + trace_recursive_unlock(cpu_buffer); + + preempt_enable_notrace(); + + return 0; +} +EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); + +static noinline void +rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info) +{ + WARN_ONCE(info->delta > (1ULL << 59), + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", + (unsigned long long)info->delta, + (unsigned long long)info->ts, + (unsigned long long)cpu_buffer->write_stamp, + sched_clock_stable() ? "" : + "If you just came from a suspend/resume,\n" + "please switch to the trace global clock:\n" + " echo global > /sys/kernel/debug/tracing/trace_clock\n"); + info->add_timestamp = 1; +} + +static struct ring_buffer_event * +__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info) +{ + struct ring_buffer_event *event; + struct buffer_page *tail_page; + unsigned long tail, write; + + /* + * If the time delta since the last event is too big to + * hold in the time field of the event, then we append a + * TIME EXTEND event ahead of the data event. + */ + if (unlikely(info->add_timestamp)) + info->length += RB_LEN_TIME_EXTEND; + + tail_page = info->tail_page = cpu_buffer->tail_page; + write = local_add_return(info->length, &tail_page->write); + + /* set write to only the index of the write */ + write &= RB_WRITE_MASK; + tail = write - info->length; + + /* + * If this is the first commit on the page, then it has the same + * timestamp as the page itself. + */ + if (!tail) + info->delta = 0; + + /* See if we shot pass the end of this buffer page */ + if (unlikely(write > BUF_PAGE_SIZE)) + return rb_move_tail(cpu_buffer, tail, info); + + /* We reserved something on the buffer */ + + event = __rb_page_index(tail_page, tail); + kmemcheck_annotate_bitfield(event, bitfield); + rb_update_event(cpu_buffer, event, info); + + local_inc(&tail_page->entries); + + /* + * If this is the first commit on the page, then update + * its timestamp. + */ + if (!tail) + tail_page->page->time_stamp = info->ts; + + /* account for these added bytes */ + local_add(info->length, &cpu_buffer->entries_bytes); + + return event; +} + +static struct ring_buffer_event * +rb_reserve_next_event(struct ring_buffer *buffer, + struct ring_buffer_per_cpu *cpu_buffer, + unsigned long length) +{ + struct ring_buffer_event *event; + struct rb_event_info info; + int nr_loops = 0; + u64 diff; + + rb_start_commit(cpu_buffer); + +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP + /* + * Due to the ability to swap a cpu buffer from a buffer + * it is possible it was swapped before we committed. + * (committing stops a swap). We check for it here and + * if it happened, we have to fail the write. + */ + barrier(); + if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) { + local_dec(&cpu_buffer->committing); + local_dec(&cpu_buffer->commits); + return NULL; + } +#endif + + info.length = rb_calculate_event_length(length); + again: + info.add_timestamp = 0; + info.delta = 0; + + /* + * We allow for interrupts to reenter here and do a trace. + * If one does, it will cause this original code to loop + * back here. Even with heavy interrupts happening, this + * should only happen a few times in a row. If this happens + * 1000 times in a row, there must be either an interrupt + * storm or we have something buggy. + * Bail! + */ + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) + goto out_fail; + + info.ts = rb_time_stamp(cpu_buffer->buffer); + diff = info.ts - cpu_buffer->write_stamp; + + /* make sure this diff is calculated here */ + barrier(); + + /* Did the write stamp get updated already? */ + if (likely(info.ts >= cpu_buffer->write_stamp)) { + info.delta = diff; + if (unlikely(test_time_stamp(info.delta))) + rb_handle_timestamp(cpu_buffer, &info); + } + + event = __rb_reserve_next(cpu_buffer, &info); + + if (unlikely(PTR_ERR(event) == -EAGAIN)) + goto again; + + if (!event) + goto out_fail; + + return event; + + out_fail: + rb_end_commit(cpu_buffer); + return NULL; +} + +/** * ring_buffer_lock_reserve - reserve a part of the buffer * @buffer: the ring buffer to reserve from * @length: the length of the data to reserve (excluding event header) @@ -2729,111 +2874,6 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) } EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); -static void -rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - u64 delta; - - /* - * The event first in the commit queue updates the - * time stamp. - */ - if (rb_event_is_commit(cpu_buffer, event)) { - /* - * A commit event that is first on a page - * updates the write timestamp with the page stamp - */ - if (!rb_event_index(event)) - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; - else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { - delta = event->array[0]; - delta <<= TS_SHIFT; - delta += event->time_delta; - cpu_buffer->write_stamp += delta; - } else - cpu_buffer->write_stamp += event->time_delta; - } -} - -static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - local_inc(&cpu_buffer->entries); - rb_update_write_stamp(cpu_buffer, event); - rb_end_commit(cpu_buffer); -} - -static __always_inline void -rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) -{ - bool pagebusy; - - if (buffer->irq_work.waiters_pending) { - buffer->irq_work.waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&buffer->irq_work.work); - } - - if (cpu_buffer->irq_work.waiters_pending) { - cpu_buffer->irq_work.waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&cpu_buffer->irq_work.work); - } - - pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; - - if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { - cpu_buffer->irq_work.wakeup_full = true; - cpu_buffer->irq_work.full_waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&cpu_buffer->irq_work.work); - } -} - -/** - * ring_buffer_unlock_commit - commit a reserved - * @buffer: The buffer to commit to - * @event: The event pointer to commit. - * - * This commits the data to the ring buffer, and releases any locks held. - * - * Must be paired with ring_buffer_lock_reserve. - */ -int ring_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event) -{ - struct ring_buffer_per_cpu *cpu_buffer; - int cpu = raw_smp_processor_id(); - - cpu_buffer = buffer->buffers[cpu]; - - rb_commit(cpu_buffer, event); - - rb_wakeups(buffer, cpu_buffer); - - trace_recursive_unlock(cpu_buffer); - - preempt_enable_notrace(); - - return 0; -} -EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); - -static inline void rb_event_discard(struct ring_buffer_event *event) -{ - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) - event = skip_time_extend(event); - - /* array[0] holds the actual length for the discarded event */ - event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; - event->type_len = RINGBUF_TYPE_PADDING; - /* time delta must be non zero */ - if (!event->time_delta) - event->time_delta = 1; -} - /* * Decrement the entries to the page that an event is on. * The event does not even need to exist, only the pointer diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index abcbf7ff8743..6e79408674aa 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3035,7 +3035,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (!iter) return ERR_PTR(-ENOMEM); - iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(), + iter->buffer_iter = kcalloc(nr_cpu_ids, sizeof(*iter->buffer_iter), GFP_KERNEL); if (!iter->buffer_iter) goto release; @@ -6990,7 +6990,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) trace_init_global_iter(&iter); for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled); + atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 404a372ad85a..7ca09cdc20c2 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -30,6 +30,7 @@ DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); +static LIST_HEAD(ftrace_generic_fields); static LIST_HEAD(ftrace_common_fields); #define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) @@ -94,6 +95,10 @@ trace_find_event_field(struct trace_event_call *call, char *name) struct ftrace_event_field *field; struct list_head *head; + field = __find_event_field(&ftrace_generic_fields, name); + if (field) + return field; + field = __find_event_field(&ftrace_common_fields, name); if (field) return field; @@ -144,6 +149,13 @@ int trace_define_field(struct trace_event_call *call, const char *type, } EXPORT_SYMBOL_GPL(trace_define_field); +#define __generic_field(type, item, filter_type) \ + ret = __trace_define_field(&ftrace_generic_fields, #type, \ + #item, 0, 0, is_signed_type(type), \ + filter_type); \ + if (ret) \ + return ret; + #define __common_field(type, item) \ ret = __trace_define_field(&ftrace_common_fields, #type, \ "common_" #item, \ @@ -153,6 +165,16 @@ EXPORT_SYMBOL_GPL(trace_define_field); if (ret) \ return ret; +static int trace_define_generic_fields(void) +{ + int ret; + + __generic_field(int, cpu, FILTER_OTHER); + __generic_field(char *, comm, FILTER_PTR_STRING); + + return ret; +} + static int trace_define_common_fields(void) { int ret; @@ -2671,6 +2693,9 @@ static __init int event_trace_init(void) if (!entry) pr_warn("Could not create tracefs 'available_events' entry\n"); + if (trace_define_generic_fields()) + pr_warn("tracing: Failed to allocated generic fields"); + if (trace_define_common_fields()) pr_warn("tracing: Failed to allocate common fields"); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index d81d6f302b14..bd1bf184c5c9 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -252,6 +252,50 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event) return match; } +/* Filter predicate for CPUs. */ +static int filter_pred_cpu(struct filter_pred *pred, void *event) +{ + int cpu, cmp; + int match = 0; + + cpu = raw_smp_processor_id(); + cmp = pred->val; + + switch (pred->op) { + case OP_EQ: + match = cpu == cmp; + break; + case OP_LT: + match = cpu < cmp; + break; + case OP_LE: + match = cpu <= cmp; + break; + case OP_GT: + match = cpu > cmp; + break; + case OP_GE: + match = cpu >= cmp; + break; + default: + break; + } + + return !!match == !pred->not; +} + +/* Filter predicate for COMM. */ +static int filter_pred_comm(struct filter_pred *pred, void *event) +{ + int cmp, match; + + cmp = pred->regex.match(current->comm, &pred->regex, + pred->regex.field_len); + match = cmp ^ pred->not; + + return match; +} + static int filter_pred_none(struct filter_pred *pred, void *event) { return 0; @@ -1002,7 +1046,10 @@ static int init_pred(struct filter_parse_state *ps, if (is_string_field(field)) { filter_build_regex(pred); - if (field->filter_type == FILTER_STATIC_STRING) { + if (!strcmp(field->name, "comm")) { + fn = filter_pred_comm; + pred->regex.field_len = TASK_COMM_LEN; + } else if (field->filter_type == FILTER_STATIC_STRING) { fn = filter_pred_string; pred->regex.field_len = field->size; } else if (field->filter_type == FILTER_DYN_STRING) @@ -1025,7 +1072,10 @@ static int init_pred(struct filter_parse_state *ps, } pred->val = val; - fn = select_comparison_fn(pred->op, field->size, + if (!strcmp(field->name, "cpu")) + fn = filter_pred_cpu; + else + fn = select_comparison_fn(pred->op, field->size, field->is_signed); if (!fn) { parse_error(ps, FILT_ERR_INVALID_OP, 0); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 8968bf720c12..ca98445782ac 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -715,13 +715,13 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) snprintf(nsecs_str, slen, "%03lu", nsecs_rem); trace_seq_printf(s, ".%s", nsecs_str); - len += strlen(nsecs_str); + len += strlen(nsecs_str) + 1; } trace_seq_puts(s, " us "); /* Print remaining spaces to fit the row's width */ - for (i = len; i < 7; i++) + for (i = len; i < 8; i++) trace_seq_putc(s, ' '); } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b7d0cdd9906c..c9956440d0e6 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -165,11 +165,9 @@ DEFINE_BASIC_FETCH_FUNCS(memory) static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, void *addr, void *dest) { - long ret; int maxlen = get_rloc_len(*(u32 *)dest); u8 *dst = get_rloc_data(dest); - u8 *src = addr; - mm_segment_t old_fs = get_fs(); + long ret; if (!maxlen) return; @@ -178,23 +176,13 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, * Try to get string again, since the string can be changed while * probing. */ - set_fs(KERNEL_DS); - pagefault_disable(); - - do - ret = __copy_from_user_inatomic(dst++, src++, 1); - while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); - - dst[-1] = '\0'; - pagefault_enable(); - set_fs(old_fs); + ret = strncpy_from_unsafe(dst, addr, maxlen); if (ret < 0) { /* Failed to fetch string */ - ((u8 *)get_rloc_data(dest))[0] = '\0'; + dst[0] = '\0'; *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); } else { - *(u32 *)dest = make_data_rloc(src - (u8 *)addr, - get_rloc_offs(*(u32 *)dest)); + *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest)); } } NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string)); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index dfab253727dc..8e481a84aeea 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -496,6 +496,8 @@ static const struct trace_mark { char sym; } mark[] = { MARK(1000000000ULL , '$'), /* 1 sec */ + MARK(100000000ULL , '@'), /* 100 msec */ + MARK(10000000ULL , '*'), /* 10 msec */ MARK(1000000ULL , '#'), /* 1000 usecs */ MARK(100000ULL , '!'), /* 100 usecs */ MARK(10000ULL , '+'), /* 10 usecs */ @@ -508,7 +510,7 @@ char trace_find_mark(unsigned long long d) int size = ARRAY_SIZE(mark); for (i = 0; i < size; i++) { - if (d >= mark[i].val) + if (d > mark[i].val) break; } diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 3f34496244e9..b746399ab59c 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -18,12 +18,6 @@ #define STACK_TRACE_ENTRIES 500 -#ifdef CC_USING_FENTRY -# define fentry 1 -#else -# define fentry 0 -#endif - static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; @@ -35,7 +29,7 @@ static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; */ static struct stack_trace max_stack_trace = { .max_entries = STACK_TRACE_ENTRIES - 1, - .entries = &stack_dump_trace[1], + .entries = &stack_dump_trace[0], }; static unsigned long max_stack_size; @@ -55,7 +49,7 @@ static inline void print_max_stack(void) pr_emerg(" Depth Size Location (%d entries)\n" " ----- ---- --------\n", - max_stack_trace.nr_entries - 1); + max_stack_trace.nr_entries); for (i = 0; i < max_stack_trace.nr_entries; i++) { if (stack_dump_trace[i] == ULONG_MAX) @@ -77,7 +71,7 @@ check_stack(unsigned long ip, unsigned long *stack) unsigned long this_size, flags; unsigned long *p, *top, *start; static int tracer_frame; int frame_size = ACCESS_ONCE(tracer_frame); - int i; + int i, x; this_size = ((unsigned long)stack) & (THREAD_SIZE-1); this_size = THREAD_SIZE - this_size; @@ -105,26 +99,20 @@ check_stack(unsigned long ip, unsigned long *stack) max_stack_size = this_size; max_stack_trace.nr_entries = 0; - - if (using_ftrace_ops_list_func()) - max_stack_trace.skip = 4; - else - max_stack_trace.skip = 3; + max_stack_trace.skip = 3; save_stack_trace(&max_stack_trace); - /* - * Add the passed in ip from the function tracer. - * Searching for this on the stack will skip over - * most of the overhead from the stack tracer itself. - */ - stack_dump_trace[0] = ip; - max_stack_trace.nr_entries++; + /* Skip over the overhead of the stack tracer itself */ + for (i = 0; i < max_stack_trace.nr_entries; i++) { + if (stack_dump_trace[i] == ip) + break; + } /* * Now find where in the stack these are. */ - i = 0; + x = 0; start = stack; top = (unsigned long *) (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); @@ -139,12 +127,15 @@ check_stack(unsigned long ip, unsigned long *stack) while (i < max_stack_trace.nr_entries) { int found = 0; - stack_dump_index[i] = this_size; + stack_dump_index[x] = this_size; p = start; for (; p < top && i < max_stack_trace.nr_entries; p++) { + if (stack_dump_trace[i] == ULONG_MAX) + break; if (*p == stack_dump_trace[i]) { - this_size = stack_dump_index[i++] = + stack_dump_trace[x] = stack_dump_trace[i++]; + this_size = stack_dump_index[x++] = (top - p) * sizeof(unsigned long); found = 1; /* Start the search from here */ @@ -156,7 +147,7 @@ check_stack(unsigned long ip, unsigned long *stack) * out what that is, then figure it out * now. */ - if (unlikely(!tracer_frame) && i == 1) { + if (unlikely(!tracer_frame)) { tracer_frame = (p - stack) * sizeof(unsigned long); max_stack_size -= tracer_frame; @@ -168,6 +159,10 @@ check_stack(unsigned long ip, unsigned long *stack) i++; } + max_stack_trace.nr_entries = x; + for (; x < i; x++) + stack_dump_trace[x] = ULONG_MAX; + if (task_stack_end_corrupted(current)) { print_max_stack(); BUG(); @@ -192,24 +187,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, if (per_cpu(trace_active, cpu)++ != 0) goto out; - /* - * When fentry is used, the traced function does not get - * its stack frame set up, and we lose the parent. - * The ip is pretty useless because the function tracer - * was called before that function set up its stack frame. - * In this case, we use the parent ip. - * - * By adding the return address of either the parent ip - * or the current ip we can disregard most of the stack usage - * caused by the stack tracer itself. - * - * The function tracer always reports the address of where the - * mcount call was, but the stack will hold the return address. - */ - if (fentry) - ip = parent_ip; - else - ip += MCOUNT_INSN_SIZE; + ip += MCOUNT_INSN_SIZE; check_stack(ip, &stack); @@ -284,7 +262,7 @@ __next(struct seq_file *m, loff_t *pos) { long n = *pos - 1; - if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX) + if (n > max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX) return NULL; m->private = (void *)n; @@ -354,7 +332,7 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, " Depth Size Location" " (%d entries)\n" " ----- ---- --------\n", - max_stack_trace.nr_entries - 1); + max_stack_trace.nr_entries); if (!stack_tracer_enabled && !max_stack_size) print_disabled(m); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index f65a0a06a8c0..88fefa68c516 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -39,6 +39,7 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) cred->cap_inheritable = CAP_EMPTY_SET; cred->cap_permitted = CAP_FULL_SET; cred->cap_effective = CAP_FULL_SET; + cred->cap_ambient = CAP_EMPTY_SET; cred->cap_bset = CAP_FULL_SET; #ifdef CONFIG_KEYS key_put(cred->request_key_auth); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index a6ffa43f2993..64ed1c37bd1f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -24,6 +24,7 @@ #include <asm/irq_regs.h> #include <linux/kvm_para.h> #include <linux/perf_event.h> +#include <linux/kthread.h> /* * The run state of the lockup detectors is controlled by the content of the @@ -66,7 +67,26 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); #define for_each_watchdog_cpu(cpu) \ for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) +/* + * The 'watchdog_running' variable is set to 1 when the watchdog threads + * are registered/started and is set to 0 when the watchdog threads are + * unregistered/stopped, so it is an indicator whether the threads exist. + */ static int __read_mostly watchdog_running; +/* + * If a subsystem has a need to deactivate the watchdog temporarily, it + * can use the suspend/resume interface to achieve this. The content of + * the 'watchdog_suspended' variable reflects this state. Existing threads + * are parked/unparked by the lockup_detector_{suspend|resume} functions + * (see comment blocks pertaining to those functions for further details). + * + * 'watchdog_suspended' also prevents threads from being registered/started + * or unregistered/stopped via parameters in /proc/sys/kernel, so the state + * of 'watchdog_running' cannot change while the watchdog is deactivated + * temporarily (see related code in 'proc' handlers). + */ +static int __read_mostly watchdog_suspended; + static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); @@ -613,46 +633,9 @@ static void watchdog_nmi_disable(unsigned int cpu) } } -void watchdog_nmi_enable_all(void) -{ - int cpu; - - mutex_lock(&watchdog_proc_mutex); - - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) - goto unlock; - - get_online_cpus(); - for_each_watchdog_cpu(cpu) - watchdog_nmi_enable(cpu); - put_online_cpus(); - -unlock: - mutex_unlock(&watchdog_proc_mutex); -} - -void watchdog_nmi_disable_all(void) -{ - int cpu; - - mutex_lock(&watchdog_proc_mutex); - - if (!watchdog_running) - goto unlock; - - get_online_cpus(); - for_each_watchdog_cpu(cpu) - watchdog_nmi_disable(cpu); - put_online_cpus(); - -unlock: - mutex_unlock(&watchdog_proc_mutex); -} #else static int watchdog_nmi_enable(unsigned int cpu) { return 0; } static void watchdog_nmi_disable(unsigned int cpu) { return; } -void watchdog_nmi_enable_all(void) {} -void watchdog_nmi_disable_all(void) {} #endif /* CONFIG_HARDLOCKUP_DETECTOR */ static struct smp_hotplug_thread watchdog_threads = { @@ -666,46 +649,89 @@ static struct smp_hotplug_thread watchdog_threads = { .unpark = watchdog_enable, }; -static void restart_watchdog_hrtimer(void *info) +/* + * park all watchdog threads that are specified in 'watchdog_cpumask' + */ +static int watchdog_park_threads(void) { - struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); - int ret; + int cpu, ret = 0; + get_online_cpus(); + for_each_watchdog_cpu(cpu) { + ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); + if (ret) + break; + } + if (ret) { + for_each_watchdog_cpu(cpu) + kthread_unpark(per_cpu(softlockup_watchdog, cpu)); + } + put_online_cpus(); + + return ret; +} + +/* + * unpark all watchdog threads that are specified in 'watchdog_cpumask' + */ +static void watchdog_unpark_threads(void) +{ + int cpu; + + get_online_cpus(); + for_each_watchdog_cpu(cpu) + kthread_unpark(per_cpu(softlockup_watchdog, cpu)); + put_online_cpus(); +} + +/* + * Suspend the hard and soft lockup detector by parking the watchdog threads. + */ +int lockup_detector_suspend(void) +{ + int ret = 0; + + mutex_lock(&watchdog_proc_mutex); /* - * No need to cancel and restart hrtimer if it is currently executing - * because it will reprogram itself with the new period now. - * We should never see it unqueued here because we are running per-cpu - * with interrupts disabled. + * Multiple suspend requests can be active in parallel (counted by + * the 'watchdog_suspended' variable). If the watchdog threads are + * running, the first caller takes care that they will be parked. + * The state of 'watchdog_running' cannot change while a suspend + * request is active (see related code in 'proc' handlers). */ - ret = hrtimer_try_to_cancel(hrtimer); - if (ret == 1) - hrtimer_start(hrtimer, ns_to_ktime(sample_period), - HRTIMER_MODE_REL_PINNED); + if (watchdog_running && !watchdog_suspended) + ret = watchdog_park_threads(); + + if (ret == 0) + watchdog_suspended++; + + mutex_unlock(&watchdog_proc_mutex); + + return ret; } -static void update_watchdog(int cpu) +/* + * Resume the hard and soft lockup detector by unparking the watchdog threads. + */ +void lockup_detector_resume(void) { + mutex_lock(&watchdog_proc_mutex); + + watchdog_suspended--; /* - * Make sure that perf event counter will adopt to a new - * sampling period. Updating the sampling period directly would - * be much nicer but we do not have an API for that now so - * let's use a big hammer. - * Hrtimer will adopt the new period on the next tick but this - * might be late already so we have to restart the timer as well. + * The watchdog threads are unparked if they were previously running + * and if there is no more active suspend request. */ - watchdog_nmi_disable(cpu); - smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1); - watchdog_nmi_enable(cpu); + if (watchdog_running && !watchdog_suspended) + watchdog_unpark_threads(); + + mutex_unlock(&watchdog_proc_mutex); } static void update_watchdog_all_cpus(void) { - int cpu; - - get_online_cpus(); - for_each_watchdog_cpu(cpu) - update_watchdog(cpu); - put_online_cpus(); + watchdog_park_threads(); + watchdog_unpark_threads(); } static int watchdog_enable_all_cpus(void) @@ -713,15 +739,12 @@ static int watchdog_enable_all_cpus(void) int err = 0; if (!watchdog_running) { - err = smpboot_register_percpu_thread(&watchdog_threads); + err = smpboot_register_percpu_thread_cpumask(&watchdog_threads, + &watchdog_cpumask); if (err) pr_err("Failed to create watchdog threads, disabled\n"); - else { - if (smpboot_update_cpumask_percpu_thread( - &watchdog_threads, &watchdog_cpumask)) - pr_err("Failed to set cpumask for watchdog threads\n"); + else watchdog_running = 1; - } } else { /* * Enable/disable the lockup detectors or @@ -787,6 +810,12 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, mutex_lock(&watchdog_proc_mutex); + if (watchdog_suspended) { + /* no parameter changes allowed while watchdog is suspended */ + err = -EAGAIN; + goto out; + } + /* * If the parameter is being read return the state of the corresponding * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the @@ -872,6 +901,12 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, mutex_lock(&watchdog_proc_mutex); + if (watchdog_suspended) { + /* no parameter changes allowed while watchdog is suspended */ + err = -EAGAIN; + goto out; + } + old = ACCESS_ONCE(watchdog_thresh); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -903,6 +938,13 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, int err; mutex_lock(&watchdog_proc_mutex); + + if (watchdog_suspended) { + /* no parameter changes allowed while watchdog is suspended */ + err = -EAGAIN; + goto out; + } + err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); if (!err && write) { /* Remove impossible cpus to keep sysctl output cleaner. */ @@ -920,6 +962,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, pr_err("cpumask update failed\n"); } } +out: mutex_unlock(&watchdog_proc_mutex); return err; } @@ -932,10 +975,8 @@ void __init lockup_detector_init(void) #ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_enabled()) { - if (!cpumask_empty(tick_nohz_full_mask)) - pr_info("Disabling watchdog on nohz_full cores by default\n"); - cpumask_andnot(&watchdog_cpumask, cpu_possible_mask, - tick_nohz_full_mask); + pr_info("Disabling watchdog on nohz_full cores by default\n"); + cpumask_copy(&watchdog_cpumask, housekeeping_mask); } else cpumask_copy(&watchdog_cpumask, cpu_possible_mask); #else diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 811edb77dd6d..ca71582fcfab 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2612,7 +2612,7 @@ void flush_workqueue(struct workqueue_struct *wq) out_unlock: mutex_unlock(&wq->mutex); } -EXPORT_SYMBOL_GPL(flush_workqueue); +EXPORT_SYMBOL(flush_workqueue); /** * drain_workqueue - drain a workqueue |